//===================================================================================
// Copyright (c) 2021    Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files(the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and / or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions :
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
//
//==================================================================================
//----------------------------------------------------------------------------------
// File: BC7Encode.hlsl
//
// The Compute Shader for BC7 Encoder
//
// Copyright (c) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License.
//----------------------------------------------------------------------------------

#ifdef ASPM_GPU
#pragma warning(disable : 3078)  // "loop control variable conflicts with a previous declaration in the outer scope"
#else                            // using CPU
#include "common_def.h"
#include "bcn_common_api.h"
#include <algorithm>
#endif

// Compile-time switches for the encoder. The ENABLE_MODEn defines are grouped
// under the name of the compute-shader entry point noted above each group.
// NOTE(review): presumably each define compiles the matching BC7 mode into that
// entry point — confirm where the defines are tested.

// TryMode456CS
#define ENABLE_MODE4
#define ENABLE_MODE5
#define ENABLE_MODE6

// TryMode02CS
#define ENABLE_MODE0
#define ENABLE_MODE2

// TryMode137CS
#define ENABLE_MODE1
#define ENABLE_MODE3
#define ENABLE_MODE7

// ENABLE_CMP_MODEn switches: only mode 6 is currently enabled, the rest are
// left commented out.
//#define ENABLE_CMP_MODE0
//#define ENABLE_CMP_MODE1
//#define ENABLE_CMP_MODE2
//#define ENABLE_CMP_MODE3
//#define ENABLE_CMP_MODE4
//#define ENABLE_CMP_MODE5
#define ENABLE_CMP_MODE6
//#define ENABLE_CMP_MODE7

#define ENABLE_CMP_API                // guards the CMP tables and helpers declared under #ifdef ENABLE_CMP_API
#define USE_NEW_SP_ERR_IDX
#define ENABLE_CMP_REFINE_MODE6_API   // API to improve mode 6 quality
#define MAX_TRY_SHAKER  1  // used in cmp_ep_shaker

//====================================================================================
//                          HLSL Host Simulation 
//====================================================================================
// Simulating HLSL compute code on a CPU host must run single-threaded.
// On the CPU the code simulates a single compute unit, as used by the CMP DXC host.

// Enable SIMULATE_GPU to run the simulation on the CPU using HPC in the CMP GUI or CMP CLI.
// Note: some bcn_encode_kernel.cpp files contain specific code to simulate with; enable
// the define USE_NEW_SINGLE_HEADER_INTERFACES and pick the external or local codec
// to run with.

//===========================================================================
// Prototype to debug a simple simulation of the shader using shared global data,
// run as a single thread on the CPU
// #define SIMULATE_GPU
//===========================================================================

#if !defined(ASPM_GPU)

    // Constants for the CPU build (this section is compiled only when ASPM_GPU is not defined).
    #define THREAD_GROUP_SIZE 64      // element count of shared_temp, g_InBuff, g_OutBuff and g_OutBuff1
    #define BLOCK_SIZE_X 4            // block width;  NOTE(review): presumably in texels (4x4 blocks) — confirm
    #define BLOCK_SIZE_Y 4            // block height; NOTE(review): presumably in texels (4x4 blocks) — confirm
    #define MAX_UINT 0xFFFFFFFF       // all 32 bits set
    #define MIN_UINT 0x00000000       // all 32 bits clear
    
    // CPU stand-in for the shader's source texture (HLSL: Texture2D g_Input).
    // Holds a single 16-texel block addressed as x + 4 * y. HLSLHost fills it with
    // channel values divided by 255, i.e. normalized 0..1.
    struct Texture2D
    {
        CGU_Vec4f Texture[16];

        // Fetches the texel at (index.x, index.y); index.z is not used.
        // The linear offset is masked to 0..15 so it always lands inside Texture.
        CGU_Vec4f Load(CGU_Vec3ui index)
        {
            const CGU_INT texel = (index.x + (index.y * 4)) & 0x0F;
            return Texture[texel];
        }

        // Same fetch as above; the extra argument z is accepted and discarded.
        CGU_Vec4f Load(CGU_Vec3ui index, CGU_UINT32 z)
        {
            CMP_UNUSED(z);
            const CGU_INT texel = (index.x + (index.y * 4)) & 0x0F;
            return Texture[texel];
        }

        // Unsigned-integer variant: index.z and index.w are ignored and every
        // float channel is implicitly converted to an unsigned integer.
        CGU_Vec4ui Load(CGU_Vec4ui index)
        {
            const CGU_INT    texel = (index.x + (index.y * 4)) & 0x0F;
            const CGU_Vec4f& src   = Texture[texel];

            CGU_Vec4ui converted;
            converted.x = src.x;  // implicit float -> uint conversion
            converted.y = src.y;
            converted.z = src.z;
            converted.w = src.w;
            return converted;
        }
    };

    // Per-thread scratch record; matches the GPU struct in HLSL, so the field
    // order and types must not be changed. One instance per simulated thread is
    // kept in shared_temp[THREAD_GROUP_SIZE].
    // NOTE(review): the per-field notes below are inferred from the field names —
    // confirm against the shader code that reads and writes them.
    struct BufferShared
    {
        CGU_Vec4ui pixel;                    // presumably the source pixel handled by this thread
        CGU_UINT32 error;                    // presumably the error of the current candidate encoding
        CGU_UINT32 mode;                     // presumably the BC7 mode of the candidate
        CGU_UINT32 partition;                // presumably the partition id of the candidate
        CGU_UINT32 index_selector;
        CGU_UINT32 rotation;
        CGU_UINT32 pbit;
        CGU_Vec4ui endPoint_low;             // low/high endpoint pair
        CGU_Vec4ui endPoint_high;
        CGU_Vec4ui endPoint_low_quantized;   // quantized counterparts of the endpoint pair
        CGU_Vec4ui endPoint_high_quantized;
        CGU_UINT32 colorindex;
        CGU_UINT32 alphaindex;
    };

    // Record type of the g_InBuff and g_OutBuff1 arrays. HLSLHost copies
    // g_OutBuff1 into g_InBuff between shader passes, so this is the data handed
    // from one pass to the next.
    // NOTE(review): field meanings are inferred from their names — confirm
    // against the TryMode... and EncodeBlocks code.
    struct SharedIOData
    {
        CGU_UINT32 error;           // presumably the best error found so far for the block
        CGU_UINT32 mode;            // presumably the BC7 mode that produced that error
        CGU_UINT32 index_selector;
        CGU_UINT32 rotation;
        CGU_UINT32 partition;
        CGU_Vec4ui data2;
    };

    // File-scope state standing in for the shader's resources in the CPU simulation.
    CMP_STATIC BufferShared shared_temp[THREAD_GROUP_SIZE];  // per-thread scratch, zeroed by HLSLHost before the first pass
    CMP_STATIC Texture2D    g_Input;                         // source block, filled by HLSLHost from image_src / 255

    // cbuffer input: on the CPU a single block is used
    CMP_STATIC CGU_UINT32 g_tex_width;         // Not used in HLSLHost simulation code
    CMP_STATIC CGU_UINT32 g_num_block_x = 1;
    CMP_STATIC CGU_UINT32 g_format;            // Not used in HLSLHost simulation code
    CMP_STATIC CGU_UINT32 g_mode_id        = 1;  // HLSLHost sets this to 6 before TryMode456CS and to 1 before TryMode137CS
    CMP_STATIC CGU_UINT32 g_start_block_id = 0;  // reset to 0 by HLSLHost
    CMP_STATIC CGU_UINT32 g_num_total_blocks;
    CMP_STATIC CGU_FLOAT  g_alpha_weight = 1.0f;
    CMP_STATIC CGU_FLOAT  g_quality      = 1.0f;

    // Buffers passed between shader calls: HLSLHost copies g_OutBuff1 into g_InBuff after each TryMode... pass.
    CMP_STATIC SharedIOData g_InBuff[THREAD_GROUP_SIZE];
    CMP_STATIC CGU_Vec4ui   g_OutBuff[THREAD_GROUP_SIZE];   // Used by EncodeBlocks & TryMode...
    CMP_STATIC SharedIOData g_OutBuff1[THREAD_GROUP_SIZE];  // Used by TryMode...

    // Forward declarations of the compute-shader entry points called by HLSLHost.
    // GI is the simulated thread index within the group (SV_GroupIndex) and
    // groupID the simulated group id (SV_GroupID).
    void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
    void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
    void TryMode02CS( CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);
    void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID);

    // Runs the compute-shader entry points on the CPU for a single block.
    //
    // image_src : 16 source texels. Every channel is divided by 255 before it is
    //             stored in g_Input, so values are expected in the 0..255 range.
    //
    // Each pass is invoked once per simulated thread index, from the highest index
    // down to 0:
    //   1. TryMode456CS with g_mode_id = 6, thread indices 15..0
    //   2. TryMode137CS with g_mode_id = 1, thread indices 63..0
    //   3. EncodeBlocks (g_mode_id is left at 1), thread indices 15..0
    // After passes 1 and 2 g_OutBuff1 is copied into g_InBuff so the next pass
    // reads the previous pass's output. Nothing is returned; results stay in the
    // file-scope buffers written by the entry points (NOTE(review): presumably
    // g_OutBuff holds the encoded block — confirm in EncodeBlocks).
    CMP_STATIC void HLSLHost(CGU_Vec4f image_src[16])
    {
        // Load the source block into the simulated texture, normalized by 255
        for (CGU_INT texel = 0; texel < 16; texel++)
        {
            g_Input.Texture[texel].x = image_src[texel].x / 255.0f;
            g_Input.Texture[texel].y = image_src[texel].y / 255.0f;
            g_Input.Texture[texel].z = image_src[texel].z / 255.0f;
            g_Input.Texture[texel].w = image_src[texel].w / 255.0f;
        }

        // Clear the scratch and I/O buffers so nothing carries over from a previous call
        memset(shared_temp, 0, sizeof(shared_temp));
        memset(g_InBuff, 0, sizeof(g_InBuff));
        memset(g_OutBuff1, 0, sizeof(g_OutBuff1));

        // A single thread group at the origin
        // (on the GPU: Dispatch (1..(n-1),1,1) where n = number of (4x4) blocks in the image)
        CGU_Vec3ui SV_GroupID = {0, 0, 0};
        g_start_block_id      = 0;

        // NOTE: shared_temp[].pixel is not pre-loaded by the host; an earlier loop
        // that copied g_Input texels (scaled to 0..255) into shared_temp was disabled.

        // First shader call
        g_mode_id = 6;
        for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
        {
            TryMode456CS(SV_GroupIndex, SV_GroupID);
        }

        // Hand the output buffer back as input for the next shader call
        memcpy(g_InBuff, g_OutBuff1, sizeof(g_InBuff));

        // Next shader call
        g_mode_id = 1;
        for (CGU_INT SV_GroupIndex = 63; SV_GroupIndex >= 0; SV_GroupIndex--)
        {
            TryMode137CS(SV_GroupIndex, SV_GroupID);
        }

        // Hand the output buffer back as input for the final shader call
        memcpy(g_InBuff, g_OutBuff1, sizeof(g_InBuff));

        // Final shader call
        for (CGU_INT SV_GroupIndex = 15; SV_GroupIndex >= 0; SV_GroupIndex--)
        {
            EncodeBlocks(SV_GroupIndex, SV_GroupID);
        }
    }

#endif


#ifdef ENABLE_CMP_API

// Table of 4 entries, each holding two 4-component vectors whose components are
// either all 0 or all 1. The entries enumerate the pairs {0,0}, {0,1}, {1,0}, {1,1}.
// NOTE(review): presumably indexed [entry][endpoint][channel] and used as parity
// (p-bit) choices for a 4-channel mode — confirm against the code that reads it.
// TODO (original author): change this to CGU_Vec4ui par_vectors42_nd[4][2];
CMP_STATIC CMP_CONSTANT CGU_UINT32 par_vectors42_nd[4][2][4] = {
    // type = 2
    {{0, 0, 0, 0}, {0, 0, 0, 0}},  // 0  {0,0}
    {{0, 0, 0, 0}, {1, 1, 1, 1}},  // 1  {0,1}
    {{1, 1, 1, 1}, {0, 0, 0, 0}},  // 2  {1,0}
    {{1, 1, 1, 1}, {1, 1, 1, 1}}   // 3  {1,1}
};

// Color component indices in R, G, B, A order.
// NOTE(review): presumably used to index 4-element channel arrays — confirm at the use sites.
#define COMP_RED        0
#define COMP_GREEN      1
#define COMP_BLUE       2
#define COMP_ALPHA      3

// Per-mode encoder parameters; g_modesettings holds one entry per BC7 mode (0..7).
// NOTE(review): the field notes below are inferred from the field names and from the
// values in g_modesettings — confirm against the code that reads them.
typedef struct
{
    CGU_UINT32 numPartitionModes;  // equals 1 << partitionBits in the rows with a non-zero partitionBits
    CGU_UINT32 maxSubSets;         // 2 or 3 in the filled-in rows; presumably subsets per block
    CGU_UINT32 channels3or4;       // 3 or 4; presumably RGB vs RGBA
    CGU_UINT32 bits;               // presumably a total bit budget for the endpoints — TODO confirm
    CGU_UINT32 clusters;           // equals 1 << indexBits in every row except modes 4 and 5
    CGU_UINT32 componentBits;      // presumably bits per endpoint component
    CGU_UINT32 partitionBits;      // presumably bits used to store the partition id
    CGU_UINT32 indexBits;          // presumably bits per pixel index
} MODESETTINGS;


// MODESETTINGS for each BC7 mode, indexed by mode number (0..7).
// The rows for modes 4 and 5 are zero apart from indexBits.
// NOTE(review): presumably modes 4 and 5 do not take their parameters from this
// table — confirm before relying on those rows.
CMP_STATIC CMP_CONSTANT MODESETTINGS g_modesettings[8] = {
//   numPartitionModes,maxSubSets   channels3or4,   bits,   clusters,   componentBits,  partitionBits,  indexBits
    {16,                3,          3,              26,     8,          4,              4,              3},  // Mode 0
    {64,                2,          3,              37,     8,          6,              6,              3},  // Mode 1
    {64,                3,          3,              30,     4,          5,              6,              2},  // Mode 2
    {64,                2,          3,              44,     4,          7,              6,              2},  // Mode 3
    { 0,                0,          0,              0,      0,          0,              0,              2},  // Mode 4
    { 0,                0,          0,              0,      0,          0,              0,              2},  // Mode 5
    { 0,                0,          4,              58,     16,         7,              0,              4},  // Mode 6
    {64,                2,          4,              42,     4,          5,              6,              2}   // Mode 7
};

#ifndef ASPM_HLSL //=======================================================


// Subset bit masks: 128 entries = 64 two-subset patterns followed by 64 three-subset patterns.
// The trailing comment on each entry spells the mask out in binary, most significant bit first.
//  - Entries 0..63 (2 subsets): only the low 16 bits are used.
//  - Entries 64..127 (3 subsets): two 16-bit masks, written "high half : low half" in the comment.
//    In the digit string after '=', a bit set in the high half reads as 2, a bit set in the
//    low half reads as 1, and a bit set in neither reads as 0.
// NOTE(review): presumably bit i belongs to pixel i of the 4x4 block, the digits are subset
// numbers, and the entry index is the partition id (plus 64 for 3 subsets) — confirm against
// the code that reads this table.
CMP_STATIC CMP_CONSTANT CGU_UINT32 subset_mask_table2[128] = {
    // 2 subset region patterns (entries 0..63)
    0x0000CCCCu,  // 0   1100 1100 1100 1100  (MSB..LSB)
    0x00008888u,  // 1   1000 1000 1000 1000
    0x0000EEEEu,  // 2   1110 1110 1110 1110
    0x0000ECC8u,  // 3   1110 1100 1100 1000
    0x0000C880u,  // 4   1100 1000 1000 0000
    0x0000FEECu,  // 5   1111 1110 1110 1100
    0x0000FEC8u,  // 6   1111 1110 1100 1000
    0x0000EC80u,  // 7   1110 1100 1000 0000
    0x0000C800u,  // 8   1100 1000 0000 0000
    0x0000FFECu,  // 9   1111 1111 1110 1100
    0x0000FE80u,  // 10  1111 1110 1000 0000
    0x0000E800u,  // 11  1110 1000 0000 0000
    0x0000FFE8u,  // 12  1111 1111 1110 1000
    0x0000FF00u,  // 13  1111 1111 0000 0000
    0x0000FFF0u,  // 14  1111 1111 1111 0000
    0x0000F000u,  // 15  1111 0000 0000 0000
    0x0000F710u,  // 16  1111 0111 0001 0000
    0x0000008Eu,  // 17  0000 0000 1000 1110
    0x00007100u,  // 18  0111 0001 0000 0000
    0x000008CEu,  // 19  0000 1000 1100 1110
    0x0000008Cu,  // 20  0000 0000 1000 1100
    0x00007310u,  // 21  0111 0011 0001 0000
    0x00003100u,  // 22  0011 0001 0000 0000
    0x00008CCEu,  // 23  1000 1100 1100 1110
    0x0000088Cu,  // 24  0000 1000 1000 1100
    0x00003110u,  // 25  0011 0001 0001 0000
    0x00006666u,  // 26  0110 0110 0110 0110
    0x0000366Cu,  // 27  0011 0110 0110 1100
    0x000017E8u,  // 28  0001 0111 1110 1000
    0x00000FF0u,  // 29  0000 1111 1111 0000
    0x0000718Eu,  // 30  0111 0001 1000 1110
    0x0000399Cu,  // 31  0011 1001 1001 1100
    0x0000AAAAu,  // 32  1010 1010 1010 1010
    0x0000F0F0u,  // 33  1111 0000 1111 0000
    0x00005A5Au,  // 34  0101 1010 0101 1010
    0x000033CCu,  // 35  0011 0011 1100 1100
    0x00003C3Cu,  // 36  0011 1100 0011 1100
    0x000055AAu,  // 37  0101 0101 1010 1010
    0x00009696u,  // 38  1001 0110 1001 0110
    0x0000A55Au,  // 39  1010 0101 0101 1010
    0x000073CEu,  // 40  0111 0011 1100 1110
    0x000013C8u,  // 41  0001 0011 1100 1000
    0x0000324Cu,  // 42  0011 0010 0100 1100
    0x00003BDCu,  // 43  0011 1011 1101 1100
    0x00006996u,  // 44  0110 1001 1001 0110
    0x0000C33Cu,  // 45  1100 0011 0011 1100
    0x00009966u,  // 46  1001 1001 0110 0110
    0x00000660u,  // 47  0000 0110 0110 0000
    0x00000272u,  // 48  0000 0010 0111 0010
    0x000004E4u,  // 49  0000 0100 1110 0100
    0x00004E40u,  // 50  0100 1110 0100 0000
    0x00002720u,  // 51  0010 0111 0010 0000
    0x0000C936u,  // 52  1100 1001 0011 0110
    0x0000936Cu,  // 53  1001 0011 0110 1100
    0x000039C6u,  // 54  0011 1001 1100 0110
    0x0000639Cu,  // 55  0110 0011 1001 1100
    0x00009336u,  // 56  1001 0011 0011 0110
    0x00009CC6u,  // 57  1001 1100 1100 0110
    0x0000817Eu,  // 58  1000 0001 0111 1110
    0x0000E718u,  // 59  1110 0111 0001 1000
    0x0000CCF0u,  // 60  1100 1100 1111 0000
    0x00000FCCu,  // 61  0000 1111 1100 1100
    0x00007744u,  // 62  0111 0111 0100 0100
    0x0000EE22u,  // 63  1110 1110 0010 0010

    // 3 Subset region patterns (entries 64..127; the numbers in the comments restart at 0)
    0xF60008CCu,  // 0    1111 0110 0000 0000 : 0000 1000 1100 1100 = 2222122011001100 (MSB...LSB)
    0x73008CC8u,  // 1    0111 0011 0000 0000 : 1000 1100 1100 1000 = 1222112211001000
    0x3310CC80u,  // 2    0011 0011 0001 0000 : 1100 1100 1000 0000 = 1122112210020000
    0x00CEEC00u,  // 3    0000 0000 1100 1110 : 1110 1100 0000 0000 = 1110110022002220
    0xCC003300u,  // 4    1100 1100 0000 0000 : 0011 0011 0000 0000 = 2211221100000000
    0xCC0000CCu,  // 5    1100 1100 0000 0000 : 0000 0000 1100 1100 = 2200220011001100
    0x00CCFF00u,  // 6    0000 0000 1100 1100 : 1111 1111 0000 0000 = 1111111122002200
    0x3300CCCCu,  // 7    0011 0011 0000 0000 : 1100 1100 1100 1100 = 1122112211001100
    0xF0000F00u,  // 8    1111 0000 0000 0000 : 0000 1111 0000 0000 = 2222111100000000
    0xF0000FF0u,  // 9    1111 0000 0000 0000 : 0000 1111 1111 0000 = 2222111111110000
    0xFF0000F0u,  // 10   1111 1111 0000 0000 : 0000 0000 1111 0000 = 2222222211110000
    0x88884444u,  // 11   1000 1000 1000 1000 : 0100 0100 0100 0100 = 2100210021002100
    0x88886666u,  // 12   1000 1000 1000 1000 : 0110 0110 0110 0110 = 2110211021102110
    0xCCCC2222u,  // 13   1100 1100 1100 1100 : 0010 0010 0010 0010 = 2210221022102210
    0xEC80136Cu,  // 14   1110 1100 1000 0000 : 0001 0011 0110 1100 = 2221221121101100
    0x7310008Cu,  // 15   0111 0011 0001 0000 : 0000 0000 1000 1100 = 0222002210021100
    0xC80036C8u,  // 16   1100 1000 0000 0000 : 0011 0110 1100 1000 = 2211211011001000
    0x310008CEu,  // 17   0011 0001 0000 0000 : 0000 1000 1100 1110 = 0022100211001110
    0xCCC03330u,  // 18   1100 1100 1100 0000 : 0011 0011 0011 0000 = 2211221122110000
    0x0CCCF000u,  // 19   0000 1100 1100 1100 : 1111 0000 0000 0000 = 1111220022002200
    0xEE0000EEu,  // 20   1110 1110 0000 0000 : 0000 0000 1110 1110 = 2220222011101110
    0x77008888u,  // 21   0111 0111 0000 0000 : 1000 1000 1000 1000 = 1222122210001000
    0xCC0022C0u,  // 22   1100 1100 0000 0000 : 0010 0010 1100 0000 = 2210221011000000
    0x33004430u,  // 23   0011 0011 0000 0000 : 0100 0100 0011 0000 = 0122012200110000
    0x00CC0C22u,  // 24   0000 0000 1100 1100 : 0000 1100 0010 0010 = 0000110022102210
    0xFC880344u,  // 25   1111 1100 1000 1000 : 0000 0011 0100 0100 = 2222221121002100
    0x06606996u,  // 26   0000 0110 0110 0000 : 0110 1001 1001 0110 = 0110122112210110
    0x66009960u,  // 27   0110 0110 0000 0000 : 1001 1001 0110 0000 = 1221122101100000
    0xC88C0330u,  // 28   1100 1000 1000 1100 : 0000 0011 0011 0000 = 2200201120112200
    0xF9000066u,  // 29   1111 1001 0000 0000 : 0000 0000 0110 0110 = 2222200201100110
    0x0CC0C22Cu,  // 30   0000 1100 1100 0000 : 1100 0010 0010 1100 = 1100221022101100
    0x73108C00u,  // 31   0111 0011 0001 0000 : 1000 1100 0000 0000 = 1222112200020000
    0xEC801300u,  // 32   1110 1100 1000 0000 : 0001 0011 0000 0000 = 2221221120000000
    0x08CEC400u,  // 33   0000 1000 1100 1110 : 1100 0100 0000 0000 = 1100210022002220
    0xEC80004Cu,  // 34   1110 1100 1000 0000 : 0000 0000 0100 1100 = 2220220021001100
    0x44442222u,  // 35   0100 0100 0100 0100 : 0010 0010 0010 0010 = 0210021002100210
    0x0F0000F0u,  // 36   0000 1111 0000 0000 : 0000 0000 1111 0000 = 0000222211110000
    0x49242492u,  // 37   0100 1001 0010 0100 : 0010 0100 1001 0010 = 0210210210210210
    0x42942942u,  // 38   0100 0010 1001 0100 : 0010 1001 0100 0010 = 0210102121020210
    0x0C30C30Cu,  // 39   0000 1100 0011 0000 : 1100 0011 0000 1100 = 1100221100221100
    0x03C0C03Cu,  // 40   0000 0011 1100 0000 : 1100 0000 0011 1100 = 1100002222111100
    0xFF0000AAu,  // 41   1111 1111 0000 0000 : 0000 0000 1010 1010 = 2222222210101010
    0x5500AA00u,  // 42   0101 0101 0000 0000 : 1010 1010 0000 0000 = 1212121200000000
    0xCCCC3030u,  // 43   1100 1100 1100 1100 : 0011 0000 0011 0000 = 2211220022112200
    0x0C0CC0C0u,  // 44   0000 1100 0000 1100 : 1100 0000 1100 0000 = 1100220011002200
    0x66669090u,  // 45   0110 0110 0110 0110 : 1001 0000 1001 0000 = 1221022012210220
    0x0FF0A00Au,  // 46   0000 1111 1111 0000 : 1010 0000 0000 1010 = 1010222222221010
    0x5550AAA0u,  // 47   0101 0101 0101 0000 : 1010 1010 1010 0000 = 1212121212120000
    0xF0000AAAu,  // 48   1111 0000 0000 0000 : 0000 1010 1010 1010 = 2222101010101010
    0x0E0EE0E0u,  // 49   0000 1110 0000 1110 : 1110 0000 1110 0000 = 1110222011102220
    0x88887070u,  // 50   1000 1000 1000 1000 : 0111 0000 0111 0000 = 2111200021112000
    0x99906660u,  // 51   1001 1001 1001 0000 : 0110 0110 0110 0000 = 2112211221120000
    0xE00E0EE0u,  // 52   1110 0000 0000 1110 : 0000 1110 1110 0000 = 2220111011102220
    0x88880770u,  // 53   1000 1000 1000 1000 : 0000 0111 0111 0000 = 2000211121112000
    0xF0000666u,  // 54   1111 0000 0000 0000 : 0000 0110 0110 0110 = 2222011001100110
    0x99006600u,  // 55   1001 1001 0000 0000 : 0110 0110 0000 0000 = 2112211200000000
    0xFF000066u,  // 56   1111 1111 0000 0000 : 0000 0000 0110 0110 = 2222222201100110
    0xC00C0CC0u,  // 57   1100 0000 0000 1100 : 0000 1100 1100 0000 = 2200110011002200
    0xCCCC0330u,  // 58   1100 1100 1100 1100 : 0000 0011 0011 0000 = 2200221122112200
    0x90006000u,  // 59   1001 0000 0000 0000 : 0110 0000 0000 0000 = 2112000000000000
    0x08088080u,  // 60   0000 1000 0000 1000 : 1000 0000 1000 0000 = 1000200010002000
    0xEEEE1010u,  // 61   1110 1110 1110 1110 : 0001 0000 0001 0000 = 2221222022212220
    0xFFF0000Au,  // 62   1111 1111 1111 0000 : 0000 0000 0000 1010 = 2222222222221010
    0x731008CEu,  // 63   0111 0011 0001 0000 : 0000 1000 1100 1110 = 0222102211021110
};

// Small count table: 2 rows of 8 entries, holding powers of two padded with zeros.
// NOTE(review): the row comments "3" and "4" presumably mean 3-channel and 4-channel
// data, and each value presumably gives the number of meaningful entries in the
// matching cmp_par_vectors_nd[row][column] list (the visible 3-channel lists hold
// 1, 2, 4, 8, 16 and 32 distinct leading entries) — confirm against the code that reads both tables.
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_npv_nd[2][8] = {
    {1, 2, 4, 8, 16, 32, 0, 0},  // 3
    {1, 2, 4, 0, 0, 0, 0, 0}     // 4
};

CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_par_vectors_nd[2][8][64][2][4] = {
    {
        // 3D
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 0}},
         {{1, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}},
         {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}},
         {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}},
         {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 0, 0}, {0, 0, 0, 0}}, {{1, 0, 1, 0}, {0, 0, 0, 0}}, {{0, 1, 1, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 0, 0}},
         {{1, 1, 0, 0}, {1, 1, 0, 0}}, {{1, 0, 1, 0}, {1, 1, 0, 0}}, {{0, 1, 1, 0}, {1, 1, 0, 0}}, {{0, 0, 0, 0}, {1, 0, 1, 0}}, {{1, 1, 0, 0}, {1, 0, 1, 0}},
         {{1, 0, 1, 0}, {1, 0, 1, 0}}, {{0, 1, 1, 0}, {1, 0, 1, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 0}}, {{1, 1, 0, 0}, {0, 1, 1, 0}}, {{1, 0, 1, 0}, {0, 1, 1, 0}},
         {{0, 1, 1, 0}, {0, 1, 1, 0}}, {{1, 0, 0, 0}, {1, 1, 1, 0}}, {{0, 1, 0, 0}, {1, 1, 1, 0}}, {{0, 0, 1, 0}, {1, 1, 1, 0}}, {{1, 1, 1, 0}, {1, 1, 1, 0}},
         {{1, 0, 0, 0}, {0, 0, 1, 0}}, {{0, 1, 0, 0}, {0, 0, 1, 0}}, {{0, 0, 1, 0}, {0, 0, 1, 0}}, {{1, 1, 1, 0}, {0, 0, 1, 0}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{0, 1, 0, 0}, {1, 0, 0, 0}}, {{0, 0, 1, 0}, {1, 0, 0, 0}}, {{1, 1, 1, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {0, 1, 0, 0}}, {{0, 1, 0, 0}, {0, 1, 0, 0}},
         {{0, 0, 1, 0}, {0, 1, 0, 0}}, {{1, 1, 1, 0}, {0, 1, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
    },
    {
        // 4D
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {0, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 1, 1, 1}}, {{0, 1, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 1, 1}, {0, 1, 1, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}},
         {{1, 0, 0, 0}, {1, 1, 1, 1}}, {{1, 1, 1, 1}, {1, 0, 0, 0}}, {{1, 1, 1, 1}, {1, 1, 1, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 1, 1}},
         {{0, 0, 1, 1}, {0, 0, 0, 0}}, {{0, 1, 0, 1}, {0, 1, 0, 1}}, {{1, 0, 0, 0}, {1, 0, 0, 0}}, {{1, 0, 0, 0}, {1, 0, 1, 1}}, {{1, 0, 1, 1}, {1, 0, 0, 0}},
         {{1, 1, 0, 1}, {1, 1, 0, 1}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
        {{{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}},
         {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}, {{0, 0, 0, 0}, {0, 0, 0, 0}}},
    },
};

// Interpolation weights, expressed in 64ths, for 2-, 3- and 4-bit palette indices.
// Indexed as cmp_rampI[index_bits - 2][index]; cmp_GetRamp() divides the entry by 64
// to get the blend factor between two endpoints. Entries past the last valid index
// of a row are padding and set to 0.
CMP_STATIC CMP_CONSTANT CGU_UINT8 cmp_rampI[3][16] = {
    {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0},           // 2 bit index
    {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},        // 3 bit index
    {0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64}  // 4 bit index
};

// Fix-up index table, one packed entry per partition id.
// Each entry is packed as ((BC7_FIXUPINDEX1 << 4) | BC7_FIXUPINDEX2):
// cmp_get_fixuptable() reads the high nibble into fixup[1] and the low nibble into fixup[2].
CMP_STATIC CMP_CONSTANT CGU_UINT32 CMPFIXUPINDEX[128] = {
    // 2 subset partitions 0..63
    0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,
    0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,
    0xf0u,0x20u,0x80u,0x20u,0x20u,0x80u,0x80u,0xf0u,
    0x20u,0x80u,0x20u,0x20u,0x80u,0x80u,0x20u,0x20u,
    0xf0u,0xf0u,0x60u,0x80u,0x20u,0x80u,0xf0u,0xf0u,
    0x20u,0x80u,0x20u,0x20u,0x20u,0xf0u,0xf0u,0x60u,
    0x60u,0x20u,0x60u,0x80u,0xf0u,0xf0u,0x20u,0x20u,
    0xf0u,0xf0u,0xf0u,0xf0u,0xf0u,0x20u,0x20u,0xf0u,

    // 3 subset partitions 64..127
    0x3fu,0x38u,0xf8u,0xf3u,0x8fu,0x3fu,0xf3u,0xf8u,
    0x8fu,0x8fu,0x6fu,0x6fu,0x6fu,0x5fu,0x3fu,0x38u,
    0x3fu,0x38u,0x8fu,0xf3u,0x3fu,0x38u,0x6fu,0xa8u,
    0x53u,0x8fu,0x86u,0x6au,0x8fu,0x5fu,0xfau,0xf8u,
    0x8fu,0xf3u,0x3fu,0x5au,0x6au,0xa8u,0x89u,0xfau,
    0xf6u,0x3fu,0xf8u,0x5fu,0xf3u,0xf6u,0xf6u,0xf8u,
    0x3fu,0xf3u,0x5fu,0x5fu,0x5fu,0x8fu,0x5fu,0xafu,
    0x5fu,0xafu,0x8fu,0xdfu,0xf3u,0xcfu,0x3fu,0x38u };


// Unpacks the fix-up indices stored for a partition.
//   fixup   : receives three entries; [0] is always 0, [1] is the high nibble and
//             [2] the low nibble of the packed CMPFIXUPINDEX entry.
//   part_id : index into CMPFIXUPINDEX (no range check is performed here).
INLINE void cmp_get_fixuptable(CMP_INOUT CGU_UINT32 fixup[3], CGU_INT part_id)
{
    CGU_UINT32 packed = CMPFIXUPINDEX[part_id];

    fixup[2] = packed & 0xF;
    fixup[1] = packed >> 4;
    fixup[0] = 0;
}


// Returns v shifted right by `bits` positions.
INLINE CGU_UINT8 shift_right_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    CGU_UINT8 shifted = v >> bits;  // variable shift amount (perf warning expected)
    return shifted;
}

// Expands a `bits`-wide endpoint code to 8 bits: the code is moved to the top of the
// byte and the same shifted value, moved right by `bits`, is added into the low part.
INLINE CGU_UINT8 expand_epocode2(CMP_IN CGU_UINT8 v, CMP_IN CGU_INT bits)
{
    CGU_UINT8 aligned  = v << (8 - bits);
    CGU_UINT8 low_fill = shift_right_epocode2(aligned, bits);
    return aligned + low_fill;
}

// Returns the rounded interpolated component value for one palette index.
// Both endpoint codes are expanded to 8 bits, blended with the cmp_rampI weight
// (in 64ths) selected by index_bits and index, and rounded with floor(x + 0.5).
INLINE CGV_FLOAT cmp_GetRamp(CMP_IN CGU_INT index_bits,  // ramp bits Valid range 2..4
                             CMP_IN CGU_INT bits,        // Component Valid range 5..8
                             CMP_IN CGU_INT p1,          // 0..255
                             CMP_IN CGU_INT p2,          // 0..255
                             CMP_IN CGU_UINT8 index)
{
    CGU_INT   lo     = expand_epocode2(p1, bits);
    CGU_INT   hi     = expand_epocode2(p2, bits);
    CGV_FLOAT weight = cmp_rampI[index_bits - 2][index] / 64.0F;

    return floor(lo + weight * (hi - lo) + 0.5F);
}

#if defined(USE_NEW_SP_ERR_IDX) 

#ifndef ASPM_GPU
// Lookup tables filled once by old_init_BC7ramps() and read by old_get_sperr().
// sp_err and sp_idx are flat arrays addressed as
//   [ramp bits - LOG_CL_BASE2][component bits - BIT_BASE2][value 0..255][o1 0..1][o2 0..1][index 0..15]
// with sp_idx holding two consecutive entries (endpoint 1, endpoint 2) per slot.
struct BC7_EncodeRamps2
{
    CGU_INT      ep_d[4][256];                  // endpoint code expanded by old_expandbits(), per component bit count
    CGU_UINT8    sp_err[3*4*256*2*2*16];        // per-slot error value written by old_init_BC7ramps()
    CGU_INT      sp_idx[3*4*256*2*2*16*2];      // per-slot endpoint pair written by old_init_BC7ramps()
    CGU_BOOL     ramp_init;                     // set TRUE by old_init_BC7ramps(); old_get_sperr() returns 0 while it is not set
};

// Single global instance of the tables.
BC7_EncodeRamps2 BC7EncodeRamps2;

// Ranges and index helpers for the BC7EncodeRamps2 tables.
// Ramp (index) bit counts run over [LOG_CL_BASE2, LOG_CL_RANGE2) and component bit
// counts over [BIT_BASE2, BIT_RANGE2); the upper bounds are exclusive loop limits.
#define LOG_CL_RANGE2                    5
#define LOG_CL_BASE2                     2
#define BIT_BASE2                        5
#define BIT_RANGE2                       9
// Map a component bit count / ramp bit count to a zero-based table index.
// The argument is parenthesised so that expressions such as BTT2(a | b) expand correctly.
#define BTT2(bits)                       ((bits) - BIT_BASE2)
#define CLT2(cl)                         ((cl) - LOG_CL_BASE2)
// Number of entries in each rampWeights2 row.
#define SOURCE_BLOCK_SIZE                16

// Interpolation weights as fractions in [0, 1], one row per index bit count (0..4).
// The 2-, 3- and 4-bit rows equal the corresponding cmp_rampI entries divided by 64.
// old_init_BC7ramps() reads rows 2..4 as rampWeights2[clogBC7][index]; unused trailing
// entries in a row are 0.
CMP_CONSTANT CGU_FLOAT  rampWeights2[5][SOURCE_BLOCK_SIZE] = {
    { 0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 0 bit index
    { 0.000000f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 1 bit index
    { 0.000000f,0.328125f,0.671875f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 2 bit index
    { 0.000000f,0.140625f,0.281250f,0.421875f,0.578125f,0.718750f,0.859375f,1.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f,0.000000f}, // 3 bit index
    { 0.000000f,0.062500f,0.140625f,0.203125f,0.265625f,0.328125f,0.406250f,0.468750f,0.531250f,0.593750f,0.671875f,0.734375f,0.796875f,0.859375f,0.937500f,1.000000f}  // 4 bit index
};

// Expands a `bits`-wide code v to 8 bits by OR-ing v shifted left by (8 - bits)
// with v shifted right by (2 * bits - 8).
CGU_INT old_expandbits(CGU_INT bits, CGU_INT v)
{
    CGU_INT high_part = v << (8 - bits);
    CGU_INT low_part  = v >> (2 * bits - 8);
    return high_part | low_part;
}

// Flat sp_err offset of the slot addressed by (ramp bits, component bits, 8-bit value,
// endpoint-1 parity, endpoint-2 parity, ramp index). The sp_idx endpoint pair of the
// same slot starts at twice this offset.
static CGU_INT old_ramps_slot(CGU_INT clogBC7, CGU_INT bits, CGU_INT value, CGU_INT o1, CGU_INT o2, CGU_INT index)
{
    return (CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (value * 2 * 2 * 16) + (o1 * 2 * 16) + (o2 * 16) + index;
}

// Fills the BC7EncodeRamps2 lookup tables; only the first call does any work.
//
// For every ramp bit count in [LOG_CL_BASE2, LOG_CL_RANGE2) and component bit count in
// [BIT_BASE2, BIT_RANGE2):
//   1. every slot is reset to the "unset" marker (sp_idx pair = {0, 255}, sp_err = 255);
//   2. for every endpoint pair (p1, p2) and ramp index, the interpolated value is computed
//      and the slot selected by that value and the parities of p1 and p2 records the pair
//      with sp_err = 0;
//   3. every slot still carrying the "unset" marker copies the endpoint pair of the nearest
//      value (lower value preferred at equal distance) whose sp_err is 0, and stores the
//      distance k in sp_err.
//
// NOTE(review): the guard flag and ramp_init are set before the tables are populated and
// nothing here synchronizes concurrent callers - confirm this is only called from one thread.
void old_init_BC7ramps()
{
    CMP_STATIC CGU_BOOL g_rampsInitialized = FALSE;
    if (g_rampsInitialized == TRUE)
        return;
    g_rampsInitialized        = TRUE;
    BC7EncodeRamps2.ramp_init = TRUE;

    CGU_INT bits;
    CGU_INT p1;
    CGU_INT p2;
    CGU_INT clogBC7;
    CGU_INT index;
    CGU_INT j;
    CGU_INT o1;
    CGU_INT o2;
    CGU_INT slot;

    // Expanded 8-bit value of every endpoint code, per component bit count.
    for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
    {
        for (p1 = 0; p1 < (1 << bits); p1++)
        {
            BC7EncodeRamps2.ep_d[BTT2(bits)][p1] = old_expandbits(bits, p1);
        }
    }

    for (clogBC7 = LOG_CL_BASE2; clogBC7 < LOG_CL_RANGE2; clogBC7++)
    {
        for (bits = BIT_BASE2; bits < BIT_RANGE2; bits++)
        {
            // Step 1: mark every slot as unset.
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < 16; index++)
                        {
                            slot = old_ramps_slot(clogBC7, bits, j, o1, o2, index);

                            BC7EncodeRamps2.sp_idx[slot * 2 + 0] = 0;
                            BC7EncodeRamps2.sp_idx[slot * 2 + 1] = 255;
                            BC7EncodeRamps2.sp_err[slot]         = 255;
                        }
                    }
                }
            }

            // Step 2: record every value that some endpoint pair reproduces exactly.
            for (p1 = 0; p1 < (1 << bits); p1++)
            {
                for (p2 = 0; p2 < (1 << bits); p2++)
                {
                    for (index = 0; index < (1 << clogBC7); index++)
                    {
                        CGV_INT floatf =
                            floor((CGV_FLOAT)BC7EncodeRamps2.ep_d[BTT2(bits)][p1] +
                                  rampWeights2[clogBC7][index] * (CGV_FLOAT)((BC7EncodeRamps2.ep_d[BTT2(bits)][p2] - BC7EncodeRamps2.ep_d[BTT2(bits)][p1])) + 0.5F);

                        // The o1/o2 coordinates are the low bits of p1 and p2. sp_err must be
                        // addressed with exactly the same slot as sp_idx.
                        slot = old_ramps_slot(clogBC7, bits, floatf, p1 & 0x1, p2 & 0x1, index);

                        BC7EncodeRamps2.sp_idx[slot * 2 + 0] = p1;
                        BC7EncodeRamps2.sp_idx[slot * 2 + 1] = p2;
                        BC7EncodeRamps2.sp_err[slot]         = 0;
                    }
                }
            }

            // Step 3: fill the remaining slots from the nearest exactly-reproducible value.
            for (j = 0; j < 256; j++)
            {
                for (o1 = 0; o1 < 2; o1++)
                {
                    for (o2 = 0; o2 < 2; o2++)
                    {
                        for (index = 0; index < (1 << clogBC7); index++)
                        {
                            slot = old_ramps_slot(clogBC7, bits, j, o1, o2, index);

                            // Only slots still holding the {0, 255} marker need filling.
                            if (BC7EncodeRamps2.sp_idx[slot * 2 + 0] != 0 || BC7EncodeRamps2.sp_idx[slot * 2 + 1] != 255)
                                continue;

                            CGU_INT k;
                            CGU_INT tf;
                            CGU_INT tc;
                            CGU_INT src;

                            for (k = 1; k < 256; k++)
                            {
                                tf = j - k;
                                tc = j + k;

                                // The range test comes first so an out-of-range value is never used as an index.
                                if (tf >= 0 && BC7EncodeRamps2.sp_err[old_ramps_slot(clogBC7, bits, tf, o1, o2, index)] == 0)
                                    src = old_ramps_slot(clogBC7, bits, tf, o1, o2, index);
                                else if (tc < 256 && BC7EncodeRamps2.sp_err[old_ramps_slot(clogBC7, bits, tc, o1, o2, index)] == 0)
                                    src = old_ramps_slot(clogBC7, bits, tc, o1, o2, index);
                                else
                                    continue;

                                // Copy both endpoints of the pair, whichever neighbour supplied it.
                                BC7EncodeRamps2.sp_idx[slot * 2 + 0] = BC7EncodeRamps2.sp_idx[src * 2 + 0];
                                BC7EncodeRamps2.sp_idx[slot * 2 + 1] = BC7EncodeRamps2.sp_idx[src * 2 + 1];
                                break;
                            }

                            BC7EncodeRamps2.sp_err[slot] = (CGU_UINT8)k;
                        }
                    }
                }
            }
        }
    }
}

// Returns a unchanged when it is greater than zero, otherwise its negation.
CGV_FLOAT old_img_absf(CGV_FLOAT a)
{
    if (a > 0.0F)
        return a;
    return -a;
}

// Looks up the sp_err entry for the given slot of the BC7EncodeRamps2 tables.
// Returns 0.0f while the tables have not been initialised (ramp_init not set).
INLINE CGV_FLOAT old_get_sperr(CGU_INT   clogBC7,  // ramp bits Valid range 2..4
                               CGU_INT   bits,     // Component Valid range 5..8
                               CGV_INT   p1,       // 0..255
                               CGU_INT   t1,
                               CGU_INT   t2,
                               CGV_UINT8 index)
{
    if (!BC7EncodeRamps2.ramp_init)
        return 0.0f;

    return BC7EncodeRamps2.sp_err[(CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16) + (BTT2(bits) * 256 * 2 * 2 * 16) + (p1 * 2 * 2 * 16) + (t1 * 2 * 16) +
                                  (t2 * 16) + index];
}
#endif

#endif

#endif // Not ASPM_HLSL

#endif // ENABLE_CMP_API

// Accessors into the shared_temp array, relative to the caller's threadBase
// (a variable named threadBase must be in scope wherever these are used).
//   get_end_point_l / get_end_point_h : quantized low / high endpoint of a subset
//   get_color_index                   : colour index of a pixel, read from the .error field
//   get_alpha_index                   : alpha index of a pixel, read from the .mode field
// The macro argument is parenthesised so that compound arguments
// (e.g. "cond ? a : b") are added to threadBase as a single value.
#define get_end_point_l(subset) shared_temp[threadBase + (subset)].endPoint_low_quantized
#define get_end_point_h(subset) shared_temp[threadBase + (subset)].endPoint_high_quantized
#define get_color_index(index) shared_temp[threadBase + (index)].error
#define get_alpha_index(index) shared_temp[threadBase + (index)].mode

// aStep: three rows of 64 entries, one row per index precision (4-, 3- and
// 2-bit, as labelled on each row). Entry values lie in 0..15, 0..7 and 0..3
// respectively. The list after each "N bit index:" label is the matching row
// of aWeight.
// NOTE(review): each entry looks like the index whose weight is closest to the
// entry's own position (0..63) - confirm against the code that reads aStep.
//4 bit index: 0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64
CMP_STATIC CMP_CONSTANT CGU_UINT32 aStep[3][64] = {
    {0, 0, 0, 1, 1, 1, 1, 2, 2, 2,  2,  2,  3,  3,  3,  3,  4,  4,  4,  4,  5,  5,  5,  5,  6,  6,  6,  6,  6,  7,  7,  7,
     7, 8, 8, 8, 8, 9, 9, 9, 9, 10, 10, 10, 10, 10, 11, 11, 11, 11, 12, 12, 12, 12, 13, 13, 13, 13, 14, 14, 14, 14, 15, 15},
    //3 bit index: 0, 9, 18, 27, 37, 46, 55, 64
    {0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3,
     3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7},
    //2 bit index: 0, 21, 43, 64
    {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
     1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3}};

// aWeight: interpolation weights on a 0..64 scale, one row per index precision.
//   row 0 : 16 weights (4-bit indices)
//   row 1 :  8 weights (3-bit indices), remaining slots are zero padding
//   row 2 :  4 weights (2-bit indices), remaining slots are zero padding
// GetRamp2() blends two endpoints as ((64 - w) * e0 + w * e1 + 32) >> 6 using
// a weight w taken from this table.
CMP_STATIC CMP_CONSTANT CGU_UINT32 aWeight[3][16] = {{0, 4, 9, 13, 17, 21, 26, 30, 34, 38, 43, 47, 51, 55, 60, 64},
                                                     {0, 9, 18, 27, 37, 46, 55, 64, 0, 0, 0, 0, 0, 0, 0, 0},
                                                     {0, 21, 43, 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}};

 //Associated to partition 0-63
// 64 entries, one per partition number 0..63; every value fits in 16 bits.
// NOTE(review): presumably one bit per pixel of a 4x4 block selecting between
// two subsets - confirm against the code that reads this table.
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions[64] = {
    0xCCCC, 0x8888, 0xEEEE, 0xECC8, 0xC880, 0xFEEC, 0xFEC8, 0xEC80, 0xC800, 0xFFEC, 0xFE80, 0xE800, 0xFFE8, 0xFF00, 0xFFF0, 0xF000,
    0xF710, 0x008E, 0x7100, 0x08CE, 0x008C, 0x7310, 0x3100, 0x8CCE, 0x088C, 0x3110, 0x6666, 0x366C, 0x17E8, 0x0FF0, 0x718E, 0x399C,
    0xaaaa, 0xf0f0, 0x5a5a, 0x33cc, 0x3c3c, 0x55aa, 0x9696, 0xa55a, 0x73ce, 0x13c8, 0x324c, 0x3bdc, 0x6996, 0xc33c, 0x9966, 0x660,
    0x272,  0x4e4,  0x4e40, 0x2720, 0xc936, 0x936c, 0x39c6, 0x639c, 0x9336, 0x9cc6, 0x817e, 0xe718, 0xccf0, 0xfcc,  0x7744, 0xee22,
};

 //Associated to partition 64-127
// 64 entries of 32 bits each, for partition numbers 64..127 (table slot =
// partition - 64).
// NOTE(review): presumably two bits per pixel of a 4x4 block selecting one of
// three subsets - confirm against the code that reads this table.
CMP_STATIC CMP_CONSTANT CGU_UINT32 blockPartitions2[64] = {
    0xaa685050, 0x6a5a5040, 0x5a5a4200, 0x5450a0a8, 0xa5a50000, 0xa0a05050, 0x5555a0a0, 0x5a5a5050, 0xaa550000, 0xaa555500, 0xaaaa5500,
    0x90909090, 0x94949494, 0xa4a4a4a4, 0xa9a59450, 0x2a0a4250, 0xa5945040, 0x0a425054, 0xa5a5a500, 0x55a0a0a0, 0xa8a85454, 0x6a6a4040,
    0xa4a45000, 0x1a1a0500, 0x0050a4a4, 0xaaa59090, 0x14696914, 0x69691400, 0xa08585a0, 0xaa821414, 0x50a4a450, 0x6a5a0200, 0xa9a58000,
    0x5090a0a8, 0xa8a09050, 0x24242424, 0x00aa5500, 0x24924924, 0x24499224, 0x50a50a50, 0x500aa550, 0xaaaa4444, 0x66660000, 0xa5a0a5a0,
    0x50a050a0, 0x69286928, 0x44aaaa44, 0x66666600, 0xaa444444, 0x54a854a8, 0x95809580, 0x96969600, 0xa85454a8, 0x80959580, 0xaa141414,
    0x96960000, 0xaaaa1414, 0xa05050a0, 0xa0a5a5a0, 0x96000000, 0x40804080, 0xa9a8a9a8, 0xaaaaaa44, 0x2a4a5254,
};

// Fix-up index pairs, one entry per partition number 0..127.
//   entries  0..63  : only element [0] is meaningful, [1] is a 0 placeholder
//   entries 64..127 : both elements are meaningful
// The pairs in 64..127 are NOT sorted (e.g. {15, 8}); the sorted variant is
// candidateFixUpIndex1DOrdered.
CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1D[128] = {
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
    { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    
    {15, 0},{15, 0},{ 6, 0},{ 8, 0},
    { 2, 0},{ 8, 0},{15, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 2, 0},{15, 0},{15, 0},{ 6, 0},
    { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
    {15, 0},{15, 0},{ 2, 0},{ 2, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 2, 0},{15, 0},
    //candidateFixUpIndex1D[i][1], i < 64 should not be used
    
    { 3,15},{ 3, 8},{15, 8},{15, 3},
    { 8,15},{ 3,15},{15, 3},{15, 8},
    { 8,15},{ 8,15},{ 6,15},{ 6,15},
    { 6,15},{ 5,15},{ 3,15},{ 3, 8},
    { 3,15},{ 3, 8},{ 8,15},{15, 3},
    { 3,15},{ 3, 8},{ 6,15},{10, 8},
    { 5, 3},{ 8,15},{ 8, 6},{ 6,10},
    { 8,15},{ 5,15},{15,10},{15, 8},
    
    { 8,15},{15, 3},{ 3,15},{ 5,10},
    { 6,10},{10, 8},{ 8, 9},{15,10},
    {15, 6},{ 3,15},{15, 8},{ 5,15},
    {15, 3},{15, 6},{15, 6},{15, 8}, //The spec does not mark the first fix-up index for this row, so 15 is used; this appears to be correct
    { 3,15},{15, 3},{ 5,15},{ 5,15},
    { 5,15},{ 8,15},{ 5,15},{10,15},
    { 5,15},{10,15},{ 8,15},{13,15},
    {15, 3},{12,15},{ 3,15},{ 3, 8},
};

// Same pairs as candidateFixUpIndex1D, but every pair in entries 64..127 is
// stored in ascending order ([0] <= [1]). The block_package*() functions walk
// pixel indices upwards and compare against [0] first and [1] second, which
// relies on this ordering.
//   entries  0..63  : only element [0] is meaningful, [1] is a 0 placeholder
//   entries 64..127 : both elements are meaningful
CMP_STATIC CMP_CONSTANT CGU_Vec2ui candidateFixUpIndex1DOrdered[128] = {
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 8, 0},{ 2, 0},
    { 2, 0},{ 8, 0},{ 8, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 8, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    
    {15, 0},{15, 0},{ 6, 0},{ 8, 0},
    { 2, 0},{ 8, 0},{15, 0},{15, 0},
    { 2, 0},{ 8, 0},{ 2, 0},{ 2, 0},
    { 2, 0},{15, 0},{15, 0},{ 6, 0},
    { 6, 0},{ 2, 0},{ 6, 0},{ 8, 0},
    {15, 0},{15, 0},{ 2, 0},{ 2, 0},
    {15, 0},{15, 0},{15, 0},{15, 0},
    {15, 0},{ 2, 0},{ 2, 0},{15, 0},
    //candidateFixUpIndex1DOrdered[i][1], i < 64 should not be used
    
    { 3,15},{ 3, 8},{ 8,15},{ 3,15},
    { 8,15},{ 3,15},{ 3,15},{ 8,15},
    { 8,15},{ 8,15},{ 6,15},{ 6,15},
    { 6,15},{ 5,15},{ 3,15},{ 3, 8},
    { 3,15},{ 3, 8},{ 8,15},{ 3,15},
    { 3,15},{ 3, 8},{ 6,15},{ 8,10},
    { 3, 5},{ 8,15},{ 6, 8},{ 6,10},
    { 8,15},{ 5,15},{10,15},{ 8,15},
    
    { 8,15},{ 3,15},{ 3,15},{ 5,10},
    { 6,10},{ 8,10},{ 8, 9},{10,15},
    { 6,15},{ 3,15},{ 8,15},{ 5,15},
    { 3,15},{ 6,15},{ 6,15},{ 8,15}, //The spec does not mark the first fix-up index for this row, so 15 is used; this appears to be correct
    { 3,15},{ 3,15},{ 5,15},{ 5,15},
    { 5,15},{ 8,15},{ 5,15},{10,15},
    { 5,15},{10,15},{ 8,15},{13,15},
    { 3,15},{12,15},{ 3,15},{ 3, 8}
};

// Reduces each channel of color to uPrec bits.
// Every channel is first widened to c * 257 ((c << 8) + c), scaled by the
// largest uPrec-bit value, and rounded by adding 32768 before dropping the
// low 16 bits.
CGU_Vec4ui quantize(CGU_Vec4ui color, CGU_UINT32 uPrec)
{
    CGU_Vec4ui widened = (color << 8) + color;
    return (widened * ((1 << uPrec) - 1) + 32768U) >> 16;
}

// Expands uPrec-bit channel values back to 8 bits: each channel is moved to
// the top of the byte (<< (8 - uPrec)) and then OR-ed with itself shifted
// right by uPrec to fill the low bits.
CGU_Vec4ui unquantize(CGU_Vec4ui color, CGU_UINT32 uPrec)
{
#ifdef ASPM_GPU
    CGU_Vec4ui widened = color << (8 - uPrec);
    return widened | (widened >> uPrec);
#else
    // CPU build: same computation, spelled out one channel at a time.
    CGU_Vec4ui expanded;

    color.x    = color.x << (8 - uPrec);
    expanded.x = color.x | (color.x >> uPrec);

    color.y    = color.y << (8 - uPrec);
    expanded.y = color.y | (color.y >> uPrec);

    color.z    = color.z << (8 - uPrec);
    expanded.z = color.z | (color.z >> uPrec);

    color.w    = color.w << (8 - uPrec);
    expanded.w = color.w | (color.w >> uPrec);

    return expanded;
#endif
}

// Exchanges two 4-component unsigned vectors.
void swap(CMP_INOUT CGU_Vec4ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec4ui CMP_REFINOUT rhs)
{
    CGU_Vec4ui held = rhs;
    rhs             = lhs;
    lhs             = held;
}

// Exchanges two 3-component unsigned vectors.
void swap(CMP_INOUT CGU_Vec3ui CMP_REFINOUT lhs, CMP_INOUT CGU_Vec3ui CMP_REFINOUT rhs)
{
    CGU_Vec3ui held = rhs;
    rhs             = lhs;
    lhs             = held;
}

// Exchanges two unsigned 32-bit scalars.
void swap(CMP_INOUT CGU_UINT32 CMP_REFINOUT lhs, CMP_INOUT CGU_UINT32 CMP_REFINOUT rhs)
{
    CGU_UINT32 held = rhs;
    rhs             = lhs;
    lhs             = held;
}

// Weighted dot product of a and b: the alpha product a.a * b.a scaled by the
// global g_alpha_weight, plus the plain dot product of the rgb parts.
CGU_UINT32 ComputeError(CMP_IN CGU_Vec4ui a, CMP_IN CGU_Vec4ui b)
{
    return (g_alpha_weight * a.a * b.a) + dot(a.rgb, b.rgb);
}

// Per-channel ordering of two vectors: after the call every channel of a is
// greater than or equal to the same channel of b. Channels are handled
// independently, so a and b may end up mixing values from both inputs.
void Ensure_A_Is_Larger(CMP_INOUT CGU_Vec4ui CMP_REFINOUT a, CMP_INOUT CGU_Vec4ui CMP_REFINOUT b)
{
    CGU_UINT32 held;

    if (a.x < b.x)
    {
        held = a.x;
        a.x  = b.x;
        b.x  = held;
    }
    if (a.y < b.y)
    {
        held = a.y;
        a.y  = b.y;
        b.y  = held;
    }
    if (a.z < b.z)
    {
        held = a.z;
        a.z  = b.z;
        b.z  = held;
    }
    if (a.w < b.w)
    {
        held = a.w;
        a.w  = b.w;
        b.w  = held;
    }
}

// Quantizes the RGB of both endpoints to 5 bits with bit 0 replaced by the
// per-endpoint P value, then:
//   endPoint[]  : rgb overwritten with the unquantize() reconstruction, a = 0xFF
//   quantized[] : quantized rgb and a = 0xFF, all four channels shifted left by 3
//   P           : P[k] is OR-ed into bit 0 of each quantized colour channel
void compress_endpoints0(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // Quantize, clear bit 0, then place the P bit there.
        quantized[ep].rgb = (quantize(endPoint[ep].rgbb, 5).rgb & 0xFFFFFFFE) | P[ep];
        quantized[ep].a   = 0xFF;

        endPoint[ep].rgb = unquantize(quantized[ep].rgbb, 5).rgb;
        endPoint[ep].a   = 0xFF;

        quantized[ep] <<= 3;
    }
#else
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // CPU stand-in for the ".rgbb" swizzle used on the GPU path.
        CGU_Vec4ui packed;
        packed.r = endPoint[ep].r;
        packed.g = endPoint[ep].g;
        packed.b = endPoint[ep].b;
        packed.a = endPoint[ep].b;

        quantized[ep].rgb = quantize(packed, 5).rgb;

        // Clear bit 0 and place the P bit there.
        quantized[ep].r = (quantized[ep].r & 0xFFFFFFFE) | P[ep];
        quantized[ep].g = (quantized[ep].g & 0xFFFFFFFE) | P[ep];
        quantized[ep].b = (quantized[ep].b & 0xFFFFFFFE) | P[ep];
        quantized[ep].a = 0xFF;

        packed.r = quantized[ep].r;
        packed.g = quantized[ep].g;
        packed.b = quantized[ep].b;
        packed.a = quantized[ep].b;

        endPoint[ep].rgb = unquantize(packed, 5).rgb;
        endPoint[ep].a   = 0xFF;

        quantized[ep].r = quantized[ep].r << 3;
        quantized[ep].g = quantized[ep].g << 3;
        quantized[ep].b = quantized[ep].b << 3;
        quantized[ep].a = quantized[ep].a << 3;
    }
#endif
}

// Quantizes the RGB of both endpoints to 7 bits with bit 0 replaced by the
// per-endpoint P value, then:
//   endPoint[]  : rgb overwritten with the unquantize() reconstruction, a = 0xFF
//   quantized[] : quantized rgb and a = 0xFF, all four channels shifted left by 1
//   P           : P[k] is OR-ed into bit 0 of each quantized colour channel
void compress_endpoints1(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // Quantize, clear bit 0, then place the P bit there.
        quantized[ep].rgb = (quantize(endPoint[ep].rgbb, 7).rgb & 0xFFFFFFFE) | P[ep];
        quantized[ep].a   = 0xFF;

        endPoint[ep].rgb = unquantize(quantized[ep].rgbb, 7).rgb;
        endPoint[ep].a   = 0xFF;

        quantized[ep] <<= 1;
    }
#else
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // CPU stand-in for the ".rgbb" swizzle used on the GPU path.
        CGU_Vec4ui packed;
        packed.r = endPoint[ep].r;
        packed.g = endPoint[ep].g;
        packed.b = endPoint[ep].b;
        packed.a = endPoint[ep].b;

        quantized[ep].rgb = quantize(packed, 7).rgb;

        // Clear bit 0 and place the P bit there.
        quantized[ep].r = (quantized[ep].r & 0xFFFFFFFE) | P[ep];
        quantized[ep].g = (quantized[ep].g & 0xFFFFFFFE) | P[ep];
        quantized[ep].b = (quantized[ep].b & 0xFFFFFFFE) | P[ep];
        quantized[ep].a = 0xFF;

        packed.r = quantized[ep].r;
        packed.g = quantized[ep].g;
        packed.b = quantized[ep].b;
        packed.a = quantized[ep].b;

        endPoint[ep].rgb = unquantize(packed, 7).rgb;
        endPoint[ep].a   = 0xFF;

        quantized[ep].r <<= 1;
        quantized[ep].g <<= 1;
        quantized[ep].b <<= 1;
        quantized[ep].a <<= 1;
    }
#endif
}

// Quantizes the RGB of both endpoints to 5 bits (no P bit), then:
//   endPoint[]  : rgb overwritten with the unquantize() reconstruction, a = 0xFF
//   quantized[] : quantized rgb and a = 0xFF, all four channels shifted left by 3
void compress_endpoints2(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_GPU
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        quantized[ep].rgb = quantize(endPoint[ep].rgbb, 5).rgb;
        quantized[ep].a   = 0xFF;

        endPoint[ep].rgb = unquantize(quantized[ep].rgbb, 5).rgb;
        endPoint[ep].a   = 0xFF;

        quantized[ep] <<= 3;
    }
#else
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // CPU stand-in for the ".rgbb" swizzle used on the GPU path.
        CGU_Vec4ui packed;
        packed.r = endPoint[ep].r;
        packed.g = endPoint[ep].g;
        packed.b = endPoint[ep].b;
        packed.a = endPoint[ep].b;

        quantized[ep].rgb = quantize(packed, 5).rgb;
        quantized[ep].a   = 0xFF;

        packed.r = quantized[ep].r;
        packed.g = quantized[ep].g;
        packed.b = quantized[ep].b;
        packed.a = quantized[ep].b;

        endPoint[ep].rgb = unquantize(packed, 5).rgb;
        endPoint[ep].a   = 0xFF;

        quantized[ep].r = quantized[ep].r << 3;
        quantized[ep].g = quantized[ep].g << 3;
        quantized[ep].b = quantized[ep].b << 3;
        quantized[ep].a = quantized[ep].a << 3;
    }
#endif
}

// Replaces bit 0 of each colour channel of both endpoints with the
// per-endpoint P value; no other quantization or shifting is applied.
//   endPoint[]  : rgb receives the adjusted values, a = 0xFF
//   quantized[] : same rgb values, a = 0xFF
//   P           : P[k] is OR-ed into bit 0 of each colour channel
void compress_endpoints3(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // Clear bit 0 and place the P bit there.
        quantized[ep].r = (endPoint[ep].x & 0xFFFFFFFE) | P[ep];
        quantized[ep].g = (endPoint[ep].y & 0xFFFFFFFE) | P[ep];
        quantized[ep].b = (endPoint[ep].z & 0xFFFFFFFE) | P[ep];
        quantized[ep].a = 0xFF;

        // The endpoint itself becomes the adjusted value.
        endPoint[ep].r = quantized[ep].r;
        endPoint[ep].g = quantized[ep].g;
        endPoint[ep].b = quantized[ep].b;
        endPoint[ep].a = 0xFF;
    }
}

// Quantizes both endpoints with 5-bit colour and 6-bit alpha, then:
//   endPoint[]  : rgb and a overwritten with their unquantize() reconstructions
//   quantized[] : quantized rgb shifted left by 3, quantized a shifted left by 2
void compress_endpoints4(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_HLSL
    [unroll] for (uint ep = 0; ep < 2; ep++)
    {
        quantized[ep].rgb = quantize(endPoint[ep].rgbb, 5).rgb;
        quantized[ep].a   = quantize(endPoint[ep].a, 6).r;

        endPoint[ep].rgb = unquantize(quantized[ep].rgbb, 5).rgb;
        endPoint[ep].a   = unquantize(quantized[ep].a, 6).r;

        quantized[ep].rgb <<= 3;
        quantized[ep].a <<= 2;
    }
#else
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // Non-HLSL stand-in for the ".rgbb" swizzle.
        CGU_Vec4ui packed;
        packed.r = endPoint[ep].r;
        packed.g = endPoint[ep].g;
        packed.b = endPoint[ep].b;
        packed.a = endPoint[ep].b;

        quantized[ep].rgb = quantize(packed, 5).rgb;
        quantized[ep].a   = quantize(endPoint[ep].a, 6).r;

        packed.r = quantized[ep].r;
        packed.g = quantized[ep].g;
        packed.b = quantized[ep].b;
        packed.a = quantized[ep].b;

        endPoint[ep].rgb = unquantize(packed, 5).rgb;
        endPoint[ep].a   = unquantize(quantized[ep].a, 6).r;

        quantized[ep].r = quantized[ep].r << 3;
        quantized[ep].g = quantized[ep].g << 3;
        quantized[ep].b = quantized[ep].b << 3;
        quantized[ep].a = quantized[ep].a << 2;
    }
#endif
}

// Quantizes the RGB of both endpoints to 7 bits and copies alpha unchanged:
//   endPoint[]  : rgb overwritten with the unquantize() reconstruction, a untouched
//   quantized[] : quantized rgb shifted left by 1, a = endPoint a (no shift)
void compress_endpoints5(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2])
{
#ifdef ASPM_HLSL
    CMP_UNROLL for (uint ep = 0; ep < 2; ep++)
    {
        quantized[ep].rgb = quantize(endPoint[ep].rgbb, 7).rgb;
        quantized[ep].a   = endPoint[ep].a;

        // endPoint alpha is kept as is: it is stored at full precision.
        endPoint[ep].rgb = unquantize(quantized[ep].rgbb, 7).rgb;

        quantized[ep].rgb <<= 1;
    }
#else
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // Non-HLSL stand-in for the ".rgbb" swizzle.
        CGU_Vec4ui packed;
        packed.r = endPoint[ep].r;
        packed.g = endPoint[ep].g;
        packed.b = endPoint[ep].b;
        packed.a = endPoint[ep].b;

        quantized[ep].rgb = quantize(packed, 7).rgb;
        quantized[ep].a   = endPoint[ep].a;

        packed.r = quantized[ep].r;
        packed.g = quantized[ep].g;
        packed.b = quantized[ep].b;
        packed.a = quantized[ep].b;

        // endPoint alpha is kept as is: it is stored at full precision.
        endPoint[ep].rgb = unquantize(packed, 7).rgb;

        quantized[ep].r = quantized[ep].r << 1;
        quantized[ep].g = quantized[ep].g << 1;
        quantized[ep].b = quantized[ep].b << 1;
    }
#endif
}

// Replaces bit 0 of all four channels of both endpoints with the per-endpoint
// P value. quantized[] and endPoint[] end up holding the same values.
//   P : P[k] is OR-ed into bit 0 of every channel of endpoint k
void compress_endpoints6(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_OUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        // Clear bit 0 and place the P bit there, channel by channel.
        quantized[ep].x = (endPoint[ep].x & 0xFFFFFFFE) | P[ep];
        quantized[ep].y = (endPoint[ep].y & 0xFFFFFFFE) | P[ep];
        quantized[ep].z = (endPoint[ep].z & 0xFFFFFFFE) | P[ep];
        quantized[ep].w = (endPoint[ep].w & 0xFFFFFFFE) | P[ep];

        endPoint[ep] = quantized[ep];
    }
}

// Quantizes all four channels of both endpoints to 6 bits with bit 0 replaced
// by the per-endpoint P value, then:
//   endPoint[]  : overwritten with the unquantize() reconstruction
//   quantized[] : quantized values shifted left by 2
//   P           : P[k] is OR-ed into bit 0 of every quantized channel
void compress_endpoints7(CMP_INOUT CGU_Vec4ui endPoint[2], CMP_INOUT CGU_Vec4ui quantized[2], CGU_Vec2ui P)
{
    CMP_UNROLL for (CGU_UINT32 ep = 0; ep < 2; ep++)
    {
        quantized[ep] = quantize(endPoint[ep], 6);

        // Clear bit 0 and place the P bit there.
        quantized[ep].x &= 0xFFFFFFFE;
        quantized[ep].x |= P[ep];
        quantized[ep].y &= 0xFFFFFFFE;
        quantized[ep].y |= P[ep];
        quantized[ep].z &= 0xFFFFFFFE;
        quantized[ep].z |= P[ep];
        quantized[ep].w &= 0xFFFFFFFE;
        quantized[ep].w |= P[ep];

        // Reconstruct before the storage shift below is applied.
        endPoint[ep] = unquantize(quantized[ep], 6);

        quantized[ep].x <<= 2;
        quantized[ep].y <<= 2;
        quantized[ep].z <<= 2;
        quantized[ep].w <<= 2;
    }
}

// Packs three subsets' endpoints and 16 colour indices from shared_temp into
// the four 32-bit words of block, tagged with 0x01 in bit 0 (by name and tag
// this looks like the BC7 mode 0 layout - NOTE(review): confirm).
//   block      : output, every word is fully overwritten
//   partition  : 64..127 numbering; stored as (partition - 64) starting at
//                bit 1 and used unmodified to index candidateFixUpIndex1DOrdered
//   threadBase : base offset into shared_temp used by the get_* macros
void block_package0(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    // Word 0: tag, partition, the top four bits (0xF0) of the six red
    // endpoints, and the start of green endpoint l(0), which spills into word 1.
    block.x = 0x01 | ((partition - 64) << 1) | ((get_end_point_l(0).r & 0xF0) << 1) | ((get_end_point_h(0).r & 0xF0) << 5) |
              ((get_end_point_l(1).r & 0xF0) << 9) | ((get_end_point_h(1).r & 0xF0) << 13) | ((get_end_point_l(2).r & 0xF0) << 17) |
              ((get_end_point_h(2).r & 0xF0) << 21) | ((get_end_point_l(0).g & 0xF0) << 25);
    // Word 1: rest of green l(0), remaining green endpoints, then blue
    // endpoints up to the start of blue l(1).
    block.y = ((get_end_point_l(0).g & 0xF0) >> 7) | ((get_end_point_h(0).g & 0xF0) >> 3) | ((get_end_point_l(1).g & 0xF0) << 1) |
              ((get_end_point_h(1).g & 0xF0) << 5) | ((get_end_point_l(2).g & 0xF0) << 9) | ((get_end_point_h(2).g & 0xF0) << 13) |
              ((get_end_point_l(0).b & 0xF0) << 17) | ((get_end_point_h(0).b & 0xF0) << 21) | ((get_end_point_l(1).b & 0xF0) << 25);
    // Word 2: remaining blue endpoints, then bit 0x08 of each red endpoint
    // (six single bits at positions 13..18), then colour index 0 from bit 19.
    block.z = ((get_end_point_l(1).b & 0xF0) >> 7) | ((get_end_point_h(1).b & 0xF0) >> 3) | ((get_end_point_l(2).b & 0xF0) << 1) |
              ((get_end_point_h(2).b & 0xF0) << 5) | ((get_end_point_l(0).r & 0x08) << 10) | ((get_end_point_h(0).r & 0x08) << 11) |
              ((get_end_point_l(1).r & 0x08) << 12) | ((get_end_point_h(1).r & 0x08) << 13) | ((get_end_point_l(2).r & 0x08) << 14) |
              ((get_end_point_h(2).r & 0x08) << 15) | (get_color_index(0) << 19);
    block.w      = 0;
    CGU_UINT32 i = 1;
    // Indices 1..min(first fix-up, 4) still land in word 2, three bits apart.
    for (; i <= cmp_min(candidateFixUpIndex1DOrdered[partition][0], 4); i++)
    {
        block.z |= get_color_index(i) << (i * 3 + 18);
    }
    if (candidateFixUpIndex1DOrdered[partition][0] < 4)  //i = 4
    {
        // First fix-up was before index 4, so index 4 sits one bit lower and
        // fits entirely in the top three bits of word 2.
        block.z |= get_color_index(4) << 29;
        i += 1;
    }
    else  //i = 5
    {
        // Index 4 straddles the word boundary: its low two bits were written
        // by the loop above, its top bit (0x04) becomes bit 0 of word 3.
        block.w |= (get_color_index(4) & 0x04) >> 2;
        for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
            block.w |= get_color_index(i) << (i * 3 - 14);
    }
    // After each fix-up position the following indices are placed one bit
    // lower (shift offset -14, then -15, then -16).
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 15);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 3 - 16);
    }
}

// Packs two subsets' endpoints and 16 colour indices from shared_temp into
// the four 32-bit words of block, tagged with 0x02 (by name and tag this
// looks like the BC7 mode 1 layout - NOTE(review): confirm).
//   block      : output; x, y, z and w are all assigned
//   partition  : stored from bit 2 and used to index candidateFixUpIndex1DOrdered
//   threadBase : base offset into shared_temp used by the get_* macros
// The index bits are laid out by one of four hand-unrolled branches chosen by
// the partition's fix-up index (15, 2, 8, otherwise 6).
void block_package1(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    // Word 0: tag, partition, and the top six bits (0xFC) of the four red endpoints.
    block.x = 0x02 | (partition << 2) | ((get_end_point_l(0).r & 0xFC) << 6) | ((get_end_point_h(0).r & 0xFC) << 12) | ((get_end_point_l(1).r & 0xFC) << 18) |
              ((get_end_point_h(1).r & 0xFC) << 24);
    // Word 1: four green endpoints, blue l(0), and the start of blue h(0).
    block.y = ((get_end_point_l(0).g & 0xFC) >> 2) | ((get_end_point_h(0).g & 0xFC) << 4) | ((get_end_point_l(1).g & 0xFC) << 10) |
              ((get_end_point_h(1).g & 0xFC) << 16) | ((get_end_point_l(0).b & 0xFC) << 22) | ((get_end_point_h(0).b & 0xFC) << 28);
    // Word 2: rest of blue h(0), remaining blue endpoints, bit 0x02 of red
    // l(0) and l(1) at bits 16 and 17, then colour index 0 from bit 18.
    block.z = ((get_end_point_h(0).b & 0xFC) >> 4) | ((get_end_point_l(1).b & 0xFC) << 2) | ((get_end_point_h(1).b & 0xFC) << 8) |
              ((get_end_point_l(0).r & 0x02) << 15) | ((get_end_point_l(1).r & 0x02) << 16) | (get_color_index(0) << 18);
    // Each branch below ORs index 0 into word 2 again; this repeats the value
    // already placed above and does not change the result.
    if (candidateFixUpIndex1DOrdered[partition][0] == 15)
    {
        // Indices 1..14 are three bits apart; index 15 gets the top two bits of word 3.
        block.w = (get_color_index(15) << 30) | (get_color_index(14) << 27) | (get_color_index(13) << 24) | (get_color_index(12) << 21) |
                  (get_color_index(11) << 18) | (get_color_index(10) << 15) | (get_color_index(9) << 12) | (get_color_index(8) << 9) |
                  (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 2)
    {
        // Index 2 occupies two bits, so indices 3..15 sit one bit lower and
        // index 5 straddles words 2 and 3.
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) |
                  (get_color_index(7) << 5) | (get_color_index(6) << 2) | (get_color_index(5) >> 1);
        block.z |= (get_color_index(5) << 31) | (get_color_index(4) << 28) | (get_color_index(3) << 25) | (get_color_index(2) << 23) |
                   (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else if (candidateFixUpIndex1DOrdered[partition][0] == 8)
    {
        // Index 8 occupies two bits, so indices 9..15 sit one bit lower.
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 9) |
                  (get_color_index(7) << 6) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
    else  //candidateFixUpIndex1DOrdered[partition] == 6
    {
        // Index 6 occupies two bits, so indices 7..15 sit one bit lower.
        block.w = (get_color_index(15) << 29) | (get_color_index(14) << 26) | (get_color_index(13) << 23) | (get_color_index(12) << 20) |
                  (get_color_index(11) << 17) | (get_color_index(10) << 14) | (get_color_index(9) << 11) | (get_color_index(8) << 8) |
                  (get_color_index(7) << 5) | (get_color_index(6) << 3) | get_color_index(5);
        block.z |=
            (get_color_index(4) << 29) | (get_color_index(3) << 26) | (get_color_index(2) << 23) | (get_color_index(1) << 20) | (get_color_index(0) << 18);
    }
}

// Packs three subsets' endpoints and 16 colour indices from shared_temp into
// the four 32-bit words of block, tagged with 0x04 (by name and tag this
// looks like the BC7 mode 2 layout - NOTE(review): confirm).
//   block      : output, every word is fully overwritten
//   partition  : 64..127 numbering; stored as (partition - 64) starting at
//                bit 3 and used unmodified to index candidateFixUpIndex1DOrdered
//   threadBase : base offset into shared_temp used by the get_* macros
void block_package2(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    // Endpoints contribute their top five bits (0xF8), five bits apart:
    // six red, six green, six blue, running across words 0..3.
    block.x = 0x04 | ((partition - 64) << 3) | ((get_end_point_l(0).r & 0xF8) << 6) | ((get_end_point_h(0).r & 0xF8) << 11) |
              ((get_end_point_l(1).r & 0xF8) << 16) | ((get_end_point_h(1).r & 0xF8) << 21) | ((get_end_point_l(2).r & 0xF8) << 26);
    block.y = ((get_end_point_l(2).r & 0xF8) >> 6) | ((get_end_point_h(2).r & 0xF8) >> 1) | ((get_end_point_l(0).g & 0xF8) << 4) |
              ((get_end_point_h(0).g & 0xF8) << 9) | ((get_end_point_l(1).g & 0xF8) << 14) | ((get_end_point_h(1).g & 0xF8) << 19) |
              ((get_end_point_l(2).g & 0xF8) << 24);
    block.z = ((get_end_point_h(2).g & 0xF8) >> 3) | ((get_end_point_l(0).b & 0xF8) << 2) | ((get_end_point_h(0).b & 0xF8) << 7) |
              ((get_end_point_l(1).b & 0xF8) << 12) | ((get_end_point_h(1).b & 0xF8) << 17) | ((get_end_point_l(2).b & 0xF8) << 22) |
              ((get_end_point_h(2).b & 0xF8) << 27);
    // Word 3: the tail of blue h(2), then colour index 0 from bit 3.
    block.w      = ((get_end_point_h(2).b & 0xF8) >> 5) | (get_color_index(0) << 3);
    CGU_UINT32 i = 1;
    // Indices are two bits apart; after each fix-up position the following
    // indices are placed one bit lower (offset +2, then +1, then +0).
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 2);
    }
    for (; i <= candidateFixUpIndex1DOrdered[partition][1]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}

// Packs two subsets' endpoints and 16 colour indices from shared_temp into
// the four 32-bit words of block, tagged with 0x08 (by name and tag this
// looks like the BC7 mode 3 layout - NOTE(review): confirm).
//   block      : output, every word is fully overwritten
//   partition  : stored from bit 4 and used to index candidateFixUpIndex1DOrdered
//   threadBase : base offset into shared_temp used by the get_* macros
void block_package3(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    // Endpoints contribute their top seven bits (0xFE), seven bits apart:
    // four red, four green, four blue, running across words 0..2.
    block.x = 0x08 | (partition << 4) | ((get_end_point_l(0).r & 0xFE) << 9) | ((get_end_point_h(0).r & 0xFE) << 16) | ((get_end_point_l(1).r & 0xFE) << 23) |
              ((get_end_point_h(1).r & 0xFE) << 30);
    block.y = ((get_end_point_h(1).r & 0xFE) >> 2) | ((get_end_point_l(0).g & 0xFE) << 5) | ((get_end_point_h(0).g & 0xFE) << 12) |
              ((get_end_point_l(1).g & 0xFE) << 19) | ((get_end_point_h(1).g & 0xFE) << 26);
    // Bit 0 (0x01) of each red endpoint is stored separately: two bits at the
    // top of word 2 and two at the bottom of word 3.
    block.z = ((get_end_point_h(1).g & 0xFE) >> 6) | ((get_end_point_l(0).b & 0xFE) << 1) | ((get_end_point_h(0).b & 0xFE) << 8) |
              ((get_end_point_l(1).b & 0xFE) << 15) | ((get_end_point_h(1).b & 0xFE) << 22) | ((get_end_point_l(0).r & 0x01) << 30) |
              ((get_end_point_h(0).r & 0x01) << 31);
    block.w = ((get_end_point_l(1).r & 0x01) << 0) | ((get_end_point_h(1).r & 0x01) << 1) | (get_color_index(0) << 2);

    CGU_UINT32 i = 1;
    // Indices are two bits apart; those after the fix-up position are placed
    // one bit lower.
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}

// Packs a single endpoint pair with separate colour and alpha index streams
// from shared_temp into the four 32-bit words of block, tagged with 0x10 (by
// name and tag this looks like the BC7 mode 4 layout - NOTE(review): confirm).
//   block          : output, every word is fully overwritten
//   rotation       : low two bits stored from bit 5
//   index_selector : low bit stored at bit 7
//   threadBase     : base offset into shared_temp used by the get_* macros
void block_package4(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 index_selector, CGU_UINT32 threadBase)
{
    // Word 0: tag, rotation, index selector, then the top five bits (0xF8)
    // of red and green endpoints and the start of blue l(0).
    block.x = 0x10 | ((rotation & 3) << 5) | ((index_selector & 1) << 7) | ((get_end_point_l(0).r & 0xF8) << 5) | ((get_end_point_h(0).r & 0xF8) << 10) |
              ((get_end_point_l(0).g & 0xF8) << 15) | ((get_end_point_h(0).g & 0xF8) << 20) | ((get_end_point_l(0).b & 0xF8) << 25);

    // Word 1: rest of blue, the top six bits (0xFC) of both alpha endpoints,
    // then the colour indices two bits apart; index 0 contributes one bit only.
    block.y = ((get_end_point_l(0).b & 0xF8) >> 7) | ((get_end_point_h(0).b & 0xF8) >> 2) | ((get_end_point_l(0).a & 0xFC) << 4) |
              ((get_end_point_h(0).a & 0xFC) << 10) | ((get_color_index(0) & 1) << 18) | (get_color_index(1) << 19) | (get_color_index(2) << 21) |
              (get_color_index(3) << 23) | (get_color_index(4) << 25) | (get_color_index(5) << 27) | (get_color_index(6) << 29) | (get_color_index(7) << 31);

    // Word 2: remaining colour indices (index 7 straddles words 1 and 2), then
    // the alpha indices three bits apart; alpha index 0 contributes two bits.
    block.z = (get_color_index(7) >> 1) | (get_color_index(8) << 1) | (get_color_index(9) << 3) | (get_color_index(10) << 5) | (get_color_index(11) << 7) |
              (get_color_index(12) << 9) | (get_color_index(13) << 11) | (get_color_index(14) << 13) | (get_color_index(15) << 15) |
              ((get_alpha_index(0) & 3) << 17) | (get_alpha_index(1) << 19) | (get_alpha_index(2) << 22) | (get_alpha_index(3) << 25) |
              (get_alpha_index(4) << 28) | (get_alpha_index(5) << 31);

    // Word 3: remaining alpha indices (index 5 straddles words 2 and 3).
    block.w = (get_alpha_index(5) >> 1) | (get_alpha_index(6) << 2) | (get_alpha_index(7) << 5) | (get_alpha_index(8) << 8) | (get_alpha_index(9) << 11) |
              (get_alpha_index(10) << 14) | (get_alpha_index(11) << 17) | (get_alpha_index(12) << 20) | (get_alpha_index(13) << 23) |
              (get_alpha_index(14) << 26) | (get_alpha_index(15) << 29);
}

// Packs a single endpoint pair with separate colour and alpha index streams
// from shared_temp into the four 32-bit words of block, tagged with 0x20 (by
// name and tag this looks like the BC7 mode 5 layout - NOTE(review): confirm).
//   block      : output, every word is fully overwritten
//   rotation   : stored from bit 6 without masking; callers must pass a value
//                that fits the field (NOTE(review): presumably 0..3)
//   threadBase : base offset into shared_temp used by the get_* macros
void block_package5(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 rotation, CGU_UINT32 threadBase)
{
    // Word 0: tag, rotation, then the top seven bits (0xFE) of red endpoints,
    // green l(0) and the start of green h(0).
    block.x = 0x20 | (rotation << 6) | ((get_end_point_l(0).r & 0xFE) << 7) | ((get_end_point_h(0).r & 0xFE) << 14) | ((get_end_point_l(0).g & 0xFE) << 21) |
              ((get_end_point_h(0).g & 0xFE) << 28);
    // Word 1: rest of green h(0), blue endpoints, then both alpha endpoints
    // unmasked (full 8-bit values); alpha h(0) spills into word 2.
    block.y = ((get_end_point_h(0).g & 0xFE) >> 4) | ((get_end_point_l(0).b & 0xFE) << 3) | ((get_end_point_h(0).b & 0xFE) << 10) |
              (get_end_point_l(0).a << 18) | (get_end_point_h(0).a << 26);
    // Word 2: tail of alpha h(0), then colour indices two bits apart, with
    // index 0 taking a single bit position (bit 2).
    block.z = (get_end_point_h(0).a >> 6) | (get_color_index(0) << 2) | (get_color_index(1) << 3) | (get_color_index(2) << 5) | (get_color_index(3) << 7) |
              (get_color_index(4) << 9) | (get_color_index(5) << 11) | (get_color_index(6) << 13) | (get_color_index(7) << 15) | (get_color_index(8) << 17) |
              (get_color_index(9) << 19) | (get_color_index(10) << 21) | (get_color_index(11) << 23) | (get_color_index(12) << 25) |
              (get_color_index(13) << 27) | (get_color_index(14) << 29) | (get_color_index(15) << 31);
    // Word 3: tail of colour index 15, then alpha indices two bits apart, with
    // alpha index 0 taking a single bit position (bit 1).
    block.w = (get_color_index(15) >> 1) | (get_alpha_index(0) << 1) | (get_alpha_index(1) << 2) | (get_alpha_index(2) << 4) | (get_alpha_index(3) << 6) |
              (get_alpha_index(4) << 8) | (get_alpha_index(5) << 10) | (get_alpha_index(6) << 12) | (get_alpha_index(7) << 14) | (get_alpha_index(8) << 16) |
              (get_alpha_index(9) << 18) | (get_alpha_index(10) << 20) | (get_alpha_index(11) << 22) | (get_alpha_index(12) << 24) |
              (get_alpha_index(13) << 26) | (get_alpha_index(14) << 28) | (get_alpha_index(15) << 30);
}

// Packs a single RGBA endpoint pair and 16 colour indices from shared_temp
// into the four 32-bit words of block, tagged with 0x40 (by name and tag this
// looks like the BC7 mode 6 layout - NOTE(review): confirm).
//   block      : output, every word is fully overwritten
//   threadBase : base offset into shared_temp used by the get_* macros
void block_package6(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 threadBase)
{
    // Word 0: tag, then the top seven bits (0xFE) of red endpoints, green
    // l(0) and the start of green h(0).
    block.x = 0x40 | ((get_end_point_l(0).r & 0xFE) << 6) | ((get_end_point_h(0).r & 0xFE) << 13) | ((get_end_point_l(0).g & 0xFE) << 20) |
              ((get_end_point_h(0).g & 0xFE) << 27);
    // Word 1: rest of green h(0), blue and alpha endpoints, and bit 0 of red
    // l(0) in bit 31 ("<<" binds tighter than "|", so only that last term is
    // shifted by 31).
    block.y = ((get_end_point_h(0).g & 0xFE) >> 5) | ((get_end_point_l(0).b & 0xFE) << 2) | ((get_end_point_h(0).b & 0xFE) << 9) |
              ((get_end_point_l(0).a & 0xFE) << 16) | ((get_end_point_h(0).a & 0xFE) << 23) | (get_end_point_l(0).r & 0x01) << 31;
    // Word 2: bit 0 of red h(0), index 0 in three bits from bit 1, then
    // indices 1..7 four bits apart.
    block.z = (get_end_point_h(0).r & 0x01) | (get_color_index(0) << 1) | (get_color_index(1) << 4) | (get_color_index(2) << 8) | (get_color_index(3) << 12) |
              (get_color_index(4) << 16) | (get_color_index(5) << 20) | (get_color_index(6) << 24) | (get_color_index(7) << 28);
    // Word 3: indices 8..15, four bits each.
    block.w = (get_color_index(8) << 0) | (get_color_index(9) << 4) | (get_color_index(10) << 8) | (get_color_index(11) << 12) | (get_color_index(12) << 16) |
              (get_color_index(13) << 20) | (get_color_index(14) << 24) | (get_color_index(15) << 28);
}

// Packs two subsets' RGBA endpoints and 16 colour indices from shared_temp
// into the four 32-bit words of block, tagged with 0x80 (by name and tag this
// looks like the BC7 mode 7 layout - NOTE(review): confirm).
//   block      : output, every word is fully overwritten
//   partition  : stored from bit 8 and used to index candidateFixUpIndex1DOrdered
//   threadBase : base offset into shared_temp used by the get_* macros
void block_package7(CMP_OUT CGU_Vec4ui CMP_REFINOUT block, CGU_UINT32 partition, CGU_UINT32 threadBase)
{
    // Endpoints contribute their top five bits (0xF8), five bits apart:
    // four red, four green, four blue, four alpha, across words 0..2.
    block.x = 0x80 | (partition << 8) | ((get_end_point_l(0).r & 0xF8) << 11) | ((get_end_point_h(0).r & 0xF8) << 16) | ((get_end_point_l(1).r & 0xF8) << 21) |
              ((get_end_point_h(1).r & 0xF8) << 26);
    block.y = ((get_end_point_h(1).r & 0xF8) >> 6) | ((get_end_point_l(0).g & 0xF8) >> 1) | ((get_end_point_h(0).g & 0xF8) << 4) |
              ((get_end_point_l(1).g & 0xF8) << 9) | ((get_end_point_h(1).g & 0xF8) << 14) | ((get_end_point_l(0).b & 0xF8) << 19) |
              ((get_end_point_h(0).b & 0xF8) << 24);
    // Bit 0x04 of each red endpoint is stored separately: two bits at the top
    // of word 2 and two at the bottom of word 3.
    block.z = ((get_end_point_l(1).b & 0xF8) >> 3) | ((get_end_point_h(1).b & 0xF8) << 2) | ((get_end_point_l(0).a & 0xF8) << 7) |
              ((get_end_point_h(0).a & 0xF8) << 12) | ((get_end_point_l(1).a & 0xF8) << 17) | ((get_end_point_h(1).a & 0xF8) << 22) |
              ((get_end_point_l(0).r & 0x04) << 28) | ((get_end_point_h(0).r & 0x04) << 29);
    block.w = ((get_end_point_l(1).r & 0x04) >> 2) | ((get_end_point_h(1).r & 0x04) >> 1) | (get_color_index(0) << 2);

    CGU_UINT32 i = 1;
    // Indices are two bits apart; those after the fix-up position are placed
    // one bit lower.
    for (; i <= candidateFixUpIndex1DOrdered[partition][0]; i++)
    {
        block.w |= get_color_index(i) << (i * 2 + 1);
    }
    for (; i < 16; i++)
    {
        block.w |= get_color_index(i) << (i * 2);
    }
}

// Synchronization point between encoder stages. GPU builds forward to
// GroupMemoryBarrierWithGroupSync(); all other builds compile to an empty
// function.
void GroupSync()
{
#if defined(ASPM_GPU)
    GroupMemoryBarrierWithGroupSync();
#endif
}

// Exchanges the alpha channel of pixel with one colour channel:
//   rotation 1 : swap r and a
//   rotation 2 : swap g and a
//   rotation 3 : swap b and a
// Any other rotation value leaves pixel unchanged.
void set_pixel_rotation(CMP_INOUT CGU_Vec4ui CMP_REFINOUT pixel, CGU_UINT32 rotation)
{
#ifdef ASPM_GPU
    if (rotation == 1)
    {
        pixel.ra = pixel.ar;
    }
    else if (rotation == 2)
    {
        pixel.ga = pixel.ag;
    }
    else if (rotation == 3)
    {
        pixel.ba = pixel.ab;
    }
#else
    // CPU build: no swizzle assignment, so swap through scalar temporaries.
    const CGU_UINT32 alpha = pixel.a;
    CGU_UINT32       colour;

    switch (rotation)
    {
    case 1:
        colour  = pixel.r;
        pixel.r = alpha;
        pixel.a = colour;
        break;
    case 2:
        colour  = pixel.g;
        pixel.g = alpha;
        pixel.a = colour;
        break;
    case 3:
        colour  = pixel.b;
        pixel.b = alpha;
        pixel.a = colour;
        break;
    default:
        break;
    }
#endif
}

// Reports whether any of the 16 pixels stored at
// shared_temp[threadBase .. threadBase + 15] has an alpha value below 255.
// The scan is only compiled in when ENABLED_MODE6 or ENABLE_CMP_MODE6 is
// defined; in every other configuration the function always returns false.
// NOTE(review): the top of this file defines ENABLE_MODE6 (without the "D");
// confirm whether ENABLED_MODE6 in the guard below is intentional.
CGU_BOOL cmp_ImageHasAlpha(CGU_UINT32 threadBase)
{
#if defined(ENABLED_MODE6) || defined(ENABLE_CMP_MODE6)
    for (CGU_INT texel = 0; texel < 16; texel++)
    {
        const CGU_UINT32 alpha = shared_temp[threadBase + texel].pixel.a;
        if (alpha < 255)
        {
            return true;
        }
    }
#endif
    return false;
}

#ifdef ENABLE_CMP_API

// Interpolates between endpoints e0 and e1 for the given palette index.
// The weight w comes from aWeight (row 2 for 2-bit indices, row 1 for 3-bit
// indices, row 0 for everything else) and the result is
// ((64 - w) * e0 + w * e1 + 32) >> 6, i.e. a 0..64 blend rounded to nearest.
CGU_UINT32 GetRamp2(CGU_UINT32 e0, CGU_UINT32 e1, CGU_UINT32 index, CGU_UINT32 indexprecision)
{
    // Pick the weight row that matches the index precision.
    CGU_UINT32 row = 0;  // default: 4-bit indices
    if (indexprecision == 2)
        row = 2;
    else if (indexprecision == 3)
        row = 1;

    const CGU_UINT32 w = aWeight[row][index];
    return (CGU_UINT32)(((64 - w) * e0 + w * e1 + 32) >> 6);
}

//====================================== MODE 6 ==========================================
// If the first packed index (low nibble of block_index[0]) lies in the upper
// half of the index range, exchanges the two endpoints and inverts every
// packed index so the block decodes to the same colours.
//   epo_code_out : in/out endpoint pair, swapped when the condition holds
//   block_index  : in/out, two words of eight 4-bit indices each; each word
//                  is replaced by 0x11111111 * (levels - 1) minus the word
//   bits         : index precision; levels = 1 << bits
void cmp_encode_apply_swap(CMP_INOUT CGU_Vec4ui epo_code_out[2], CMP_INOUT CGU_UINT32 block_index[2], CMP_IN CGU_INT bits)
{
    const CGU_UINT32 levels = 1 << bits;

    // First index already in the lower half: nothing to do.
    if ((block_index[0] & 15) < levels / 2)
        return;

    // Exchange the endpoints.
    CGU_Vec4ui first = epo_code_out[0];
    epo_code_out[0]  = epo_code_out[1];
    epo_code_out[1]  = first;

    // Mirror every nibble around the highest level.
    const CGU_UINT32 allHighest = (CGU_UINT32)(0x11111111 * (levels - 1));
    block_index[0]              = allHighest - block_index[0];
    block_index[1]              = allHighest - block_index[1];
}

// ORs bitVal into the four-word bit buffer base, starting at bit position
// offset. When the field crosses a 32-bit word boundary the upper part is
// OR-ed into the following word, provided that word exists (index < 4).
//   base   : in/out, destination words; existing bits are preserved (OR only)
//   offset : starting bit position
//   bits   : field width; bitVal is not masked to this width here
//   bitVal : value to store
// Returns offset + bits, the position just after the written field.
CGU_INT cmp_Write32Bit(CMP_INOUT CGU_UINT32 base[4], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT32 bitVal)
{
    const CGU_INT word   = offset / 32;
    const CGU_INT bitPos = offset % 32;

    base[word] |= bitVal << bitPos;

    // Field spills past the end of this word: carry the high part over.
    if ((bitPos + bits > 32) && (word + 1 < 4))
        base[word + 1] |= cmp_shift_right_uint32(bitVal, 32 - bitPos);

    return offset + bits;
}

// Appends the 16 pixel indices to the bit stream in 'data', starting at bit pPos.
//   color_index : two words, each holding eight 4-bit indices (pixel 0 in the low nibble)
//   bits        : index width; the very first index is written with bits - 1 bits
//   flips       : bit n set => pixel n is written as (levels - 1) - index
void cmp_encode_index2(CMP_INOUT CGU_UINT32 data[4], CMP_IN CGU_INT pPos, CMP_INOUT CGU_UINT32 color_index[2], CMP_IN CGU_INT bits, CMP_IN CGU_INT flips)
{
    CGU_INT levels = 1 << bits;

    for (CGU_INT pixel = 0; pixel < 16; pixel++)
    {
        // nibble (pixel % 8) of word (pixel / 8)
        CGU_UINT32 q = (color_index[pixel / 8] >> ((pixel % 8) * 4)) & 15;

        if (((flips >> pixel) & 1) > 0)
            q = (levels - 1) - q;

        // the first index is stored with one bit less than the others
        CGU_INT width = (pixel == 0) ? (bits - 1) : bits;
        pPos          = cmp_Write32Bit(data, pPos, width, q);
    }
}

// Estimates the mean color and the dominant axis of the first numEntries pixels.
//   eigen_vector : OUT, result of 6 power iterations on the 4x4 covariance matrix
//                  (started from (1,1,1,1), renormalized on every second pass)
//   image_mean   : OUT, per-channel average of the pixels
//   image_src    : IN, pixel colors
//   numEntries   : IN, number of pixels read; also used as divisor, so it must not be 0
// The symmetric covariance matrix is kept as its 10 upper-triangle terms:
//   [0]=xx [1]=xy [2]=xz [3]=xw [4]=yy [5]=yz [6]=yw [7]=zz [8]=zw [9]=ww
void cmp_eigen_vector(CMP_INOUT CGV_Vec4f CMP_REFINOUT eigen_vector,
                      CMP_INOUT CGU_Vec4f CMP_REFINOUT image_mean,
                      CMP_IN CGV_Vec4ui image_src[16],
                      CMP_IN CGU_INT numEntries)
{
    CGU_INT n;
    image_mean   = 0.0f;
    eigen_vector = 0.0f;

    CGV_Vec4f channelSum  = {0.0f, 0.0f, 0.0f, 0.0f};
    CGV_FLOAT productSum[10] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};
    CGV_FLOAT cov[10];

    // accumulate channel sums and the sums of pairwise channel products
    for (n = 0; n < numEntries; n++)
    {
        CGV_Vec4f px;
        px.x = image_src[n].x;
        px.y = image_src[n].y;
        px.z = image_src[n].z;
        px.w = image_src[n].w;

        channelSum.x += px.x;
        channelSum.y += px.y;
        channelSum.z += px.z;
        channelSum.w += px.w;

        productSum[0] += px.x * px.x;
        productSum[1] += px.x * px.y;
        productSum[2] += px.x * px.z;
        productSum[3] += px.x * px.w;
        productSum[4] += px.y * px.y;
        productSum[5] += px.y * px.z;
        productSum[6] += px.y * px.w;
        productSum[7] += px.z * px.z;
        productSum[8] += px.z * px.w;
        productSum[9] += px.w * px.w;
    }

    image_mean = channelSum / (CGV_FLOAT)numEntries;

    // covariance term = sum(a*b) - sum(a)*sum(b)/N
    cov[0] = productSum[0] - (channelSum.x * channelSum.x / numEntries);
    cov[1] = productSum[1] - (channelSum.x * channelSum.y / numEntries);
    cov[2] = productSum[2] - (channelSum.x * channelSum.z / numEntries);
    cov[3] = productSum[3] - (channelSum.x * channelSum.w / numEntries);
    cov[4] = productSum[4] - (channelSum.y * channelSum.y / numEntries);
    cov[5] = productSum[5] - (channelSum.y * channelSum.z / numEntries);
    cov[6] = productSum[6] - (channelSum.y * channelSum.w / numEntries);
    cov[7] = productSum[7] - (channelSum.z * channelSum.z / numEntries);
    cov[8] = productSum[8] - (channelSum.z * channelSum.w / numEntries);
    cov[9] = productSum[9] - (channelSum.w * channelSum.w / numEntries);

    // scale down by 256*256; a multiply is used instead of a divide (1.5258789062500000e-05)
    CGV_FLOAT inv_var = 1.0 / (256 * 256);
    for (n = 0; n < 10; n++)
    {
        cov[n] = cov[n] * inv_var;
    }

    // power iteration: repeatedly multiply the current guess by the covariance matrix
    CGV_Vec4f guess      = {1.0f, 1.0f, 1.0f, 1.0f};
    CGU_INT   iterations = 6;  // 4 not enough for HQ : can use quality to set ranges from 2..n

    for (n = 0; n < iterations; n++)
    {
        CGV_Vec4f product;
        product.x = cov[0] * guess.x + cov[1] * guess.y + cov[2] * guess.z + cov[3] * guess.w;
        product.y = cov[1] * guess.x + cov[4] * guess.y + cov[5] * guess.z + cov[6] * guess.w;
        product.z = cov[2] * guess.x + cov[5] * guess.y + cov[7] * guess.z + cov[8] * guess.w;
        product.w = cov[3] * guess.x + cov[6] * guess.y + cov[8] * guess.z + cov[9] * guess.w;

        if (n % 2 == 1)
        {
            // odd passes: rescale to unit length
            CGV_FLOAT lengthSq  = cmp_dot4f(product, product);
            CGV_FLOAT invLength = cmp_Image_rsqrt(lengthSq);
            guess               = product * invLength;
        }
        else
        {
            // even passes: carry the raw product forward
            guess = product;
        }
    }

    eigen_vector = guess;
}

// Turns the two projected extents into quantized 8-bit endpoints.
// Each extent is mapped back to color space (ext * eigen_vector + image_mean) and
// two candidates are built for it: one with every channel rounded to an even value
// (clamped to 0..254) and one with every channel rounded to an odd value (clamped
// to 1..255). The candidate with the smaller squared distance to the unquantized
// endpoint wins (the odd one on a tie).
// Note the order of the outputs: ext[0] produces end_points_out[1] and ext[1]
// produces end_points_out[0].
void cmp_endpoints2(CMP_INOUT CGU_Vec4ui end_points_out[2], CMP_IN CGV_Vec4f ext[2], CMP_IN CGV_Vec4f eigen_vector, CMP_IN CGV_Vec4f image_mean)
{
    CGV_FLOAT levelHigh = 255;  // Mode 6 levels = 1 << bits = 128   then use (level * 2) - 1
    CGV_FLOAT levelLow  = 254;  // Mode 6 levels = 1 << bits = 128   then use (level * 2) - 2

    CGV_Vec4f unquantized[2];
    unquantized[0] = ext[0] * eigen_vector + image_mean;
    unquantized[1] = ext[1] * eigen_vector + image_mean;

    for (CGU_INT subset = 0; subset < 2; subset++)
    {  // this code affects quality
        CGV_Vec4f ep = unquantized[subset];

        // candidate with all channels even
        CGV_Vec4f evenCand;
        evenCand.x = cmp_clampf((CGV_INT)((ep.x / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        evenCand.y = cmp_clampf((CGV_INT)((ep.y / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        evenCand.z = cmp_clampf((CGV_INT)((ep.z / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);
        evenCand.w = cmp_clampf((CGV_INT)((ep.w / 255.0f * levelHigh) / 2.0f + 0.5f) * 2.0f, 0, levelLow);

        // candidate with all channels odd
        CGV_Vec4f oddCand;
        oddCand.x = cmp_clampf((CGV_INT)((ep.x / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        oddCand.y = cmp_clampf((CGV_INT)((ep.y / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        oddCand.z = cmp_clampf((CGV_INT)((ep.z / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);
        oddCand.w = cmp_clampf((CGV_INT)((ep.w / 255.0f * levelHigh - 1) / 2.0f + 0.5f) * 2 + 1, 1, levelHigh);

        CGV_FLOAT errEven = cmp_dot4f(ep - evenCand, ep - evenCand);
        CGV_FLOAT errOdd  = cmp_dot4f(ep - oddCand, ep - oddCand);

        CGV_Vec4f winner;
        if (errEven < errOdd)
            winner = evenCand;
        else
            winner = oddCand;

        // subset 0 fills output slot 1, subset 1 fills output slot 0
        CGU_INT slot = 1 - subset;
        end_points_out[slot].x = winner.x;
        end_points_out[slot].y = winner.y;
        end_points_out[slot].z = winner.z;
        end_points_out[slot].w = winner.w;
    }
}

// Finds the extent of the block along the principal axis and converts it into a
// pair of quantized endpoints (via cmp_endpoints2).
//   end_points_out : OUT, the two quantized endpoints
//   eigen_vector   : IN, principal axis of the block
//   image_mean     : IN, mean color of the block
//   image_src      : IN, pixel colors
//   numEntries     : IN, index of the LAST pixel to examine (the loop is inclusive),
//                    values above 15 are treated as 15 so image_src is never overrun
//   partition_mask : IN, bit n set => pixel n takes part in the min/max search
// The extents start at min = 255 / max = 0 and are tightened with the projection of
// each selected pixel onto the axis. If the resulting range is narrower than 1.0 it
// is widened by 0.5 on both sides so the endpoints do not collapse.
void cmp_block_endpoints(CMP_INOUT CGU_Vec4ui end_points_out[2],
                         CMP_IN CGV_Vec4f eigen_vector,
                         CMP_IN CGV_Vec4f image_mean,
                         CMP_IN CGU_Vec4ui image_src[16],
                         CMP_IN CGU_INT numEntries,     //IN: range 0..15 (MAX_SUBSET_SIZE)
                         CMP_IN CGU_INT partition_mask  // 0xFFFF:FFFF
)
{
    CGV_Vec4f ext[2] = {{255.0f, 255.0f, 255.0f, 255.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};

    // Test the mask on an unsigned copy: the previous "signed mask << 1, then >>= 1
    // per pixel" scheme left-shifted a negative int for masks such as 0xFFFFFFFF,
    // which is undefined behavior in the CPU build. Bit k3 of the mask is what that
    // scheme inspected for pixel k3, so the selection is unchanged.
    CGU_UINT32 mask_bits = (CGU_UINT32)partition_mask;

    // image_src holds 16 pixels; never walk past the last one
    CGU_INT last_entry = (numEntries > 15) ? 15 : numEntries;

    // find min/max
    for (CGU_INT k3 = 0; k3 <= last_entry; k3++)
    {
        if (((mask_bits >> k3) & 1) == 0)
            continue;

        CGV_FLOAT dot = 0;
        CGV_Vec4f diff;
        diff.x = image_src[k3].x - image_mean.x;
        diff.y = image_src[k3].y - image_mean.y;
        diff.z = image_src[k3].z - image_mean.z;
        diff.w = image_src[k3].w - image_mean.w;

        // signed distance of the pixel from the mean, measured along the axis
        dot += cmp_dot4f(eigen_vector, diff);

        // all four lanes carry the same scalar extent
        ext[0].x = cmp_minf(ext[0].x, dot);
        ext[0].y = cmp_minf(ext[0].y, dot);
        ext[0].z = cmp_minf(ext[0].z, dot);
        ext[0].w = cmp_minf(ext[0].w, dot);

        ext[1].x = cmp_maxf(ext[1].x, dot);
        ext[1].y = cmp_maxf(ext[1].y, dot);
        ext[1].z = cmp_maxf(ext[1].z, dot);
        ext[1].w = cmp_maxf(ext[1].w, dot);
    }

    // create some distance if the endpoints collapse
    if (ext[1].x - ext[0].x < 1.0f)
    {
        ext[0] -= 0.5f;
        ext[1] += 0.5f;
    }

    cmp_endpoints2(end_points_out, ext, eigen_vector, image_mean);
}

// Limits v to the inclusive range [a, b]; the lower bound is checked first.
CGV_UINT8 clampIndex2(CGV_UINT8 v, CGV_UINT8 a, CGV_UINT8 b)
{
    return (v < a) ? a : ((v > b) ? b : v);
}

// Assigns a ramp index to every pixel by projecting it onto the principal axis.
//   index_out    : OUT, one index per pixel; left untouched when all projections are equal
//   eigen_vector : IN, principal axis
//   image_mean   : IN, mean color the pixels are centered on
//   image_src    : IN, pixel colors
//   numEntries   : IN, index of the LAST pixel processed (all loops are inclusive)
// The projections are scaled with numEntries / (lowest - highest), so the pixel with
// the highest projection lands on 0 and the lowest on numEntries. The indices are
// then rebased on their minimum and passed through clampIndex2(…, 0, 15).
void cmp_block_index(CMP_INOUT CGU_UINT32 index_out[16],
                     CMP_IN CGV_Vec4f eigen_vector,
                     CMP_IN CGV_Vec4f image_mean,
                     CMP_IN CGU_Vec4ui image_src[16],
                     CMP_IN CGU_UINT32 numEntries      // Range 0..15 (MAX_SUBSET_SIZE)
)
{
    CGV_FLOAT  projection[16];
    CGU_UINT32 p;

    //====================================================================
    // Center every pixel on the mean and project it onto the axis
    //====================================================================
    for (p = 0; p <= numEntries; p++)
    {
        CGV_Vec4f centered;
        centered.x = image_src[p].x - image_mean.x;
        centered.y = image_src[p].y - image_mean.y;
        centered.z = image_src[p].z - image_mean.z;
        centered.w = image_src[p].w - image_mean.w;

        CGV_Vec4f weighted = centered * eigen_vector;
        projection[p]      = weighted.x + weighted.y + weighted.z + weighted.w;
    }

    // range of the projections
    CGV_FLOAT highest = projection[0];
    CGV_FLOAT lowest  = projection[0];
    for (p = 1; p <= numEntries; p++)
    {
        if (highest < projection[p])
            highest = projection[p];
        if (lowest > projection[p])
            lowest = projection[p];
    }

    CGV_FLOAT range = lowest - highest;  // zero or negative

    if (range == 0.0f)
        return;

    CGV_FLOAT scale = numEntries / range;

    // initial index projection, remembering the smallest index produced
    CGU_UINT32 smallest = 0;
    for (p = 0; p <= numEntries; p++)
    {
        CGV_FLOAT scaled  = projection[p] * scale;
        CGV_FLOAT rounded = floor(scaled + 0.5F - highest * scale);
        index_out[p]      = (CGV_UINT32)rounded;

        if (p == 0 || index_out[p] < smallest)
            smallest = index_out[p];
    }

    // reposition all indices so the smallest one becomes 0
    for (p = 0; p <= numEntries; p++)
    {
        index_out[p] = clampIndex2(index_out[p] - smallest, 0, 15);
    }
}

// Accumulates an error value for a 16 pixel block encoded with one endpoint pair.
// For every pixel, (pixel - endPoint[0]) is projected onto the endpoint span, the
// projection is turned into a ramp index through aStep[0], the color is rebuilt
// with the aWeight[0] weights and ComputeError() of the difference between the
// rebuilt and the source pixel is added to the total.
//   endPoint_in : the two endpoints; copied to a local pair, the argument is not written
//   image_src   : the 16 source pixels
// Returns the accumulated error.
// NOTE(review): the result of cmp_dotVec4i is assigned both to a whole CGU_Vec2i and
// to a single .x component below; only the .x values are read. Confirm against the
// helper's declaration.
CGU_UINT32 cmp_calcblockerr(CGU_Vec4ui endPoint_in[2], CGU_Vec4ui image_src[16])
{
    CGU_UINT32 error = 0;
    CGU_Vec4ui pixel = image_src[0];
    CGU_Vec4ui endPoint[2];
    CGU_Vec4i  pixelDiff;

    // work on a local copy so the endpoints can be swapped below
    endPoint[0] = endPoint_in[0];
    endPoint[1] = endPoint_in[1];
    // offset of pixel 0 from the first endpoint
    pixelDiff.x = pixel.x - endPoint[0].x;
    pixelDiff.y = pixel.y - endPoint[0].y;
    pixelDiff.z = pixel.z - endPoint[0].z;
    pixelDiff.w = pixel.w - endPoint[0].w;

    CGU_Vec4i span;
    CGU_Vec2i span_norm_sqr;
    CGU_Vec2i dotProduct;

    // vector from endpoint 0 to endpoint 1
    span.x = endPoint[1].x - endPoint[0].x;
    span.y = endPoint[1].y - endPoint[0].y;
    span.z = endPoint[1].z - endPoint[0].z;
    span.w = endPoint[1].w - endPoint[0].w;

    span_norm_sqr = cmp_dotVec4i(span, span);
    dotProduct    = cmp_dotVec4i(span, pixelDiff);
    // If pixel 0 projects beyond (roughly) the middle of the span, reverse the ramp:
    // negate the span and exchange the endpoints. span_norm_sqr is not recomputed;
    // negating the span does not change its squared length.
    if (span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32(dotProduct.x * 63.49999) > CGU_UINT32(32 * span_norm_sqr.x))
    {
        span.x = -span.x;
        span.y = -span.y;
        span.z = -span.z;
        span.w = -span.w;
        swap(endPoint[0], endPoint[1]);
    }

    CGU_UINT32 color_index;
    CGU_Vec4ui pixel_r;
    for (CGU_UINT32 i = 0; i < 16; i++)
    {
        pixel = image_src[i];

        pixelDiff.x = pixel.x - endPoint[0].x;
        pixelDiff.y = pixel.y - endPoint[0].y;
        pixelDiff.z = pixel.z - endPoint[0].z;
        pixelDiff.w = pixel.w - endPoint[0].w;

        dotProduct.x = cmp_dotVec4i(span, pixelDiff);
        // Degenerate span or projection at/before endpoint 0 -> index 0;
        // projection at/after endpoint 1 -> aStep[0][63];
        // otherwise the projection ratio (scaled to 0..63) selects the aStep[0] entry.
        color_index  = (span_norm_sqr.x <= 0 || dotProduct.x <= 0)
                          ? 0
                          : ((dotProduct.x < span_norm_sqr.x) ? aStep[0][CGU_UINT32(dotProduct.x * 63.49999 / span_norm_sqr.x)] : aStep[0][63]);

        // rebuild the pixel: rounded 6-bit fixed point blend of the two endpoints
        pixel_r = (endPoint[0] * (64 - aWeight[0][color_index]) + endPoint[1] * aWeight[0][color_index] + 32u) >> 6;

        // presumably orders the operands so the unsigned subtraction below cannot
        // wrap - helper is defined elsewhere, TODO confirm
        Ensure_A_Is_Larger(pixel_r, pixel);
        pixel_r -= pixel;
        error += ComputeError(pixel_r, pixel_r);
    }

    return error;
}

// Derives endpoints and per-pixel indices for a block and returns the block error.
// Steps: clear all 16 indices, estimate mean + principal axis (cmp_eigen_vector),
// quantize the endpoints (cmp_block_endpoints), assign indices (cmp_block_index)
// and measure the error of the endpoint pair (cmp_calcblockerr).
//   epo_code_out   : OUT, quantized endpoint pair
//   index_out      : OUT, one index per pixel
//   image_src      : IN, pixel colors
//   numEntries     : IN, forwarded unchanged to the three helpers
//   partition_mask : IN, forwarded to cmp_block_endpoints
// NOTE(review): cmp_eigen_vector iterates k < numEntries while cmp_block_endpoints
// and cmp_block_index iterate up to and including numEntries - confirm which
// meaning (count or last index) callers intend.
CGU_FLOAT cmp_GetIndexedEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                  CMP_INOUT CGU_UINT32 index_out[16],
                                  CMP_IN CGU_Vec4ui image_src[16],
                                  CMP_IN CGU_INT numEntries,
                                  CMP_IN CGU_INT partition_mask)
{
    CGV_Vec4f blockMean = {0.0f, 0.0f, 0.0f, 0.0f};
    CGV_Vec4f blockAxis;

    for (CGU_INT px = 0; px < 16; px++)
        index_out[px] = 0;

    cmp_eigen_vector(blockAxis, blockMean, image_src, numEntries);
    cmp_block_endpoints(epo_code_out, blockAxis, blockMean, image_src, numEntries, partition_mask);
    cmp_block_index(index_out, blockAxis, blockMean, image_src, numEntries);

    // the integer error is converted to the floating point return type
    CGU_UINT32 blockError = cmp_calcblockerr(epo_code_out, image_src);
    return blockError;
}

// Packs a mode 6 block into the four output words.
// Bit layout written, in order:
//   7 bits  : the value 64 (mode 6 marker)
//   8 x 7   : endpoint channels without their lowest bit, interleaved as
//             x0 x1 y0 y1 z0 z1 w0 w1 (0/1 = endpoint number)
//   2 x 1   : p bits, taken from the lowest bit of each endpoint's x channel
//   indices : written by cmp_encode_index2 with 4 bits per index and no flips
// cmp_encode_apply_swap runs first and may exchange the endpoints and invert the indices.
void cmp_encode_mode6(CMP_INOUT CGU_UINT32 cmp_out[4], CMP_IN CGU_Vec4ui epo_code_out[2], CMP_IN CGU_UINT32 packed_color_index[2])
{
    cmp_encode_apply_swap(epo_code_out, packed_color_index, 4);

    for (CGU_INT word = 0; word < 4; word++)
        cmp_out[word] = 0;

    // mode 6
    CGU_INT cursor = cmp_Write32Bit(cmp_out, 0, 7, 64);

    // endpoints
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[0].x >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[1].x >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[0].y >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[1].y >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[0].z >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[1].z >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[0].w >> 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 7, epo_code_out[1].w >> 1);

    // p bits
    cursor = cmp_Write32Bit(cmp_out, cursor, 1, epo_code_out[0].x & 1);
    cursor = cmp_Write32Bit(cmp_out, cursor, 1, epo_code_out[1].x & 1);

    // quantized values
    cmp_encode_index2(cmp_out, cursor, packed_color_index, 4, 0);
}

//====================================== MODES 01237 ==========================================
// Normalizes a set of indices in place and returns the largest resulting index.
// The indices are rebased so the smallest becomes 0 and are then divided by the
// largest step D (2 <= D <= max - min) that divides every rebased index exactly,
// e.g. {2, 6, 10} -> {0, 1, 2}.
//   index      : IN/OUT, the first numEntries values are rewritten
//   numEntries : IN, number of valid entries in index
// Returns 0 when numEntries is 0 or when every index is already 0 (index is left
// untouched in both cases); otherwise the maximum of the rewritten indices.
//
// Fixes relative to the previous version:
//  * the "all entries divisible" test lived inside the loop body where it could
//    never be true, so D was always 1 and no collapsing ever happened;
//  * the returned maximum was the value measured BEFORE rebasing, not the maximum
//    of the rewritten indices;
//  * divisibility and the division itself now use integer arithmetic instead of
//    fmod() and a float reciprocal.
CGU_UINT32 index_collapse2(CMP_INOUT CGU_UINT32 index[16], CGU_UINT32 numEntries)
{
    if (numEntries == 0)
        return 0;

    CGU_UINT32 minIndex = index[0];
    CGU_UINT32 MaxIndex = index[0];

    for (CGU_UINT32 km = 1; km < numEntries; km++)
    {
        if (index[km] < minIndex)
            minIndex = index[km];
        if (index[km] > MaxIndex)
            MaxIndex = index[km];
    }

    if (MaxIndex == 0)
        return 0;

    // largest d that divides every rebased index
    CGU_UINT32 D = 1;

    for (CGU_UINT32 d = 2; d <= MaxIndex - minIndex; d++)
    {
        CGU_BOOL allDivisible = true;
        for (CGU_UINT32 ent = 0U; ent < numEntries; ent++)
        {
            if (((index[ent] - minIndex) % d) != 0)
            {
                allDivisible = false;
                break;
            }
        }
        if (allDivisible)
            D = d;
    }

    // rebase, collapse and measure the new maximum in one pass
    CGU_UINT32 collapsedMax = 0;
    for (CGU_UINT32 ki = 0; ki < numEntries; ki++)
    {
        index[ki] = (index[ki] - minIndex) / D;
        if (index[ki] > collapsedMax)
            collapsedMax = index[ki];
    }

    return collapsedMax;
}

// Computes, for every cluster index that occurs in index_cluster, the rounded
// average color of the pixels assigned to it.
//   image_cluster_mean : OUT, mean color per cluster index (low 4 bits of the index);
//                        entries for index values that never occur are not written
//   image_src          : IN, pixel colors
//   index_cluster      : IN, cluster index of each pixel (only the low 4 bits are used)
//   numEntries         : IN, number of pixels
//   channels3or4       : IN, 4 keeps the alpha mean, any other value forces it to 0
INLINE void GetClusterMean2(CMP_INOUT CGV_Vec4f image_cluster_mean[16],
                            CMP_IN CGU_Vec4ui image_src[16],
                            CMP_IN CGU_UINT32 index_cluster[16],
                            CMP_IN CGU_UINT32 numEntries,  // < 16
                            CMP_IN CGU_UINT32 channels3or4)
{  // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CGU_UINT32 memberCount[16];  // pixels per cluster, only valid for clusters in use
    CGU_UINT32 usedCluster[16];  // distinct cluster indices in order of first appearance
    CGU_UINT32 usedTotal = 0;
    CGU_UINT32 entry;

    // reset the counters and sums of every cluster that is referenced
    for (entry = 0; entry < numEntries; entry++)
    {
        CGU_UINT32 cluster          = index_cluster[entry] & 0x0F;
        memberCount[cluster]        = 0;
        image_cluster_mean[cluster] = 0.0f;
    }

    // sum the colors per cluster and record each cluster on first sight
    for (entry = 0; entry < numEntries; entry++)
    {
        CGU_UINT32 cluster = index_cluster[entry] & 0x0F;
        if (memberCount[cluster] == 0)
            usedCluster[usedTotal++] = cluster;
        memberCount[cluster]++;

        image_cluster_mean[cluster].x += image_src[entry].x;
        image_cluster_mean[cluster].y += image_src[entry].y;
        image_cluster_mean[cluster].z += image_src[entry].z;
        image_cluster_mean[cluster].w += image_src[entry].w;
    }

    // turn the sums into means rounded to the nearest integer
    for (CGU_UINT32 u = 0; u < usedTotal; u++)
    {
        CGU_UINT32 cluster = usedCluster[u];
        if (memberCount[cluster] == 0)
            continue;

        image_cluster_mean[cluster].x = (CGV_FLOAT)floor((image_cluster_mean[cluster].x / (CGV_FLOAT)memberCount[cluster]) + 0.5F);
        image_cluster_mean[cluster].y = (CGV_FLOAT)floor((image_cluster_mean[cluster].y / (CGV_FLOAT)memberCount[cluster]) + 0.5F);
        image_cluster_mean[cluster].z = (CGV_FLOAT)floor((image_cluster_mean[cluster].z / (CGV_FLOAT)memberCount[cluster]) + 0.5F);
        if (channels3or4 == 4)
            image_cluster_mean[cluster].w = (CGV_FLOAT)floor((image_cluster_mean[cluster].w / (CGV_FLOAT)memberCount[cluster]) + 0.5F);
        else
            image_cluster_mean[cluster].w = 0.0f;
    }
}

#ifndef ASPM_HLSL // CPU Version

#define USE_OLDCODE

// Returns the subset (0, 1 or 2) that pixel 'index' belongs to for a partition shape.
//   part_id    : partition shape number
//   maxSubsets : 2 selects the two-subset table entries; any other value selects the
//                three-subset entries stored 64 positions further into subset_mask_table2
//   index      : pixel number, used as a bit position in the mask
// Two subsets  : bit set in the table entry => subset 1, otherwise subset 0.
// Three subsets: the entry's upper 16 bits mark subset 2 and the lower 16 bits mark
//                subset 1. The upper mask is tested first, so a pixel flagged in both
//                halves is reported as subset 2.
INLINE CGU_UINT8 cmp_get_partition_subset2(CMP_IN CGU_INT part_id, CMP_IN CGU_INT maxSubsets, CMP_IN CGU_INT index)
{
    if (maxSubsets == 2)
    {
        CGU_UINT32 twoSubsetMask = subset_mask_table2[part_id];
        if (twoSubsetMask & (0x01 << index))
            return 1;
        return 0;  // This can be moved to caller, just return mask!!
    }

    // 3 region subsets
    part_id += 64;
    CGU_UINT32 subset1Mask = subset_mask_table2[part_id] & 0xFFFF;
    CGU_UINT32 subset2Mask = subset_mask_table2[part_id] >> 16;
    CGU_UINT32 pixelBit    = 0x01 << index;

    if (subset2Mask & pixelBit)
        return 2;
    if (subset1Mask & pixelBit)
        return 1;
    return 0;  // This can be moved to caller, just return mask!!
}

// Distributes the 16 block pixels over the subsets of a partition shape.
// Block modes 0 and 2 use three subsets, every other mode uses two. Pixels are
// appended to their subset in scan order; with channels3or4 == 3 the stored alpha
// is forced to 0.
void cmp_GetPartitionSubSet2_mode01237(CMP_INOUT CGV_Vec4ui image_subsets[3][16],  // OUT: Subset pattern mapped with image src colors
                                       CMP_INOUT CGU_INT entryCount_out[3],        // OUT: Number of entries per subset
                                       CMP_IN CGU_UINT8 partition,                 // Partition Shape 0..63
                                       CMP_IN CGV_Vec4ui image_src[16],            // Image colors
                                       CMP_IN CGU_INT blockMode,                   // [0,1,2,3 or 7]
                                       CMP_IN CGU_UINT8 channels3or4)
{  // 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
    CGU_UINT8 subsetTotal = (blockMode == 0 || blockMode == 2) ? 3 : 2;

    for (CGU_INT s = 0; s < 3; s++)
        entryCount_out[s] = 0;

    for (CGU_INT px = 0; px < 16; px++)
    {
        CGU_UINT8 subset = cmp_get_partition_subset2(partition, subsetTotal, px);
        CGU_INT   slot   = entryCount_out[subset];

        image_subsets[subset][slot].x = image_src[px].x;
        image_subsets[subset][slot].y = image_src[px].y;
        image_subsets[subset][slot].z = image_src[px].z;

        // with only 3 channels the alpha of the subset entry is set to 0
        if (channels3or4 == 3)
            image_subsets[subset][slot].w = 0;
        else
            image_subsets[subset][slot].w = image_src[px].w;

        entryCount_out[subset] = slot + 1;
    }
}

// Computes the mean color of the first numEntries pixels and the pixels' offsets
// from that mean.
//   image_centered : OUT, image_src[k] - mean for k < numEntries; the w component is
//                    only written when channels3or4 == 4
//   mean_out       : OUT, per-channel mean; w stays 0 unless channels3or4 == 4
//   numEntries     : IN, pixel count, used as divisor (must not be 0)
void cmp_GetImageCentered(CMP_INOUT CGV_Vec4f image_centered[16],
                          CMP_INOUT CGV_Vec4f CMP_REFINOUT mean_out,
                          CMP_IN CGV_Vec4ui image_src[16],
                          CMP_IN CGU_INT numEntries,
                          CMP_IN CGU_UINT8 channels3or4)
{
    CGU_BOOL withAlpha = (channels3or4 == 4);
    CGU_INT  px;

    // sum the channels ...
    mean_out = 0.0f;
    for (px = 0; px < numEntries; px++)
    {
        mean_out.x += image_src[px].x;
        mean_out.y += image_src[px].y;
        mean_out.z += image_src[px].z;
        if (withAlpha)
            mean_out.w += image_src[px].w;
    }

    // ... and divide by the pixel count
    mean_out /= (CGV_FLOAT)numEntries;

    // offsets from the mean
    for (px = 0; px < numEntries; px++)
    {
        image_centered[px].x = image_src[px].x - mean_out.x;
        image_centered[px].y = image_src[px].y - mean_out.y;
        image_centered[px].z = image_src[px].z - mean_out.z;
        if (withAlpha)
            image_centered[px].w = image_src[px].w - mean_out.w;
    }
}

// Builds the channel covariance matrix of mean-centered pixels.
// The matrix is stored flat with a stride of 4: element (a, b) lives at a + b * 4.
// Only the channels3or4 x channels3or4 part is written. One triangle is summed
// directly over the pixels, the other is filled in by mirroring.
void cmp_GetCovarianceVector(CMP_INOUT CGV_FLOAT covariance_out[16],
                             CMP_IN CGV_Vec4f image_centered[16],
                             CMP_IN CGU_INT numEntries,
                             CMP_IN CGU_UINT8 channels3or4)
{
    CGU_UINT8 a;
    CGU_UINT8 b;

    // elements with b <= a: sum of products over all pixels
    for (a = 0; a < channels3or4; a++)
    {
        for (b = 0; b <= a; b++)
        {
            CGU_INT cell         = a + b * 4;
            covariance_out[cell] = 0;
            for (CGU_INT px = 0; px < numEntries; px++)
                covariance_out[cell] += image_centered[px][a] * image_centered[px][b];
        }
    }

    // elements with b > a: copy from the mirrored position
    for (a = 0; a < channels3or4; a++)
    {
        for (b = a + 1; b < channels3or4; b++)
            covariance_out[a + b * 4] = covariance_out[b + a * 4];
    }
}

// Approximates the dominant eigenvector of a covariance matrix.
// The matrix (flat, element (a, b) at a + b * 4) is scaled by its largest diagonal
// element, multiplied by itself once, and the entries at (dominant + c * 4) of the
// squared matrix - where 'dominant' is the channel with the largest diagonal value -
// are taken as the vector, which is then normalized to unit length.
// Only the first channels3or4 components of EigenVector_out are written.
void cmp_GetEigenVector(CMP_INOUT CGV_Vec4f CMP_REFINOUT EigenVector_out,  // Normalized Eigen Vector output
                        CMP_IN CGV_FLOAT CovarianceVector[16],             // Covariance Vector
                        CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT cov[16];
    CGV_FLOAT covSquared[16];
    CGU_UINT8 a;
    CGU_UINT8 b;
    CGU_UINT8 m;

    // working copy of the used part of the matrix
    for (a = 0; a < channels3or4; a++)
        for (b = 0; b < channels3or4; b++)
            cov[a + b * 4] = CovarianceVector[a + b * 4];

    // largest diagonal element (stays 0 when none is positive)
    CGV_FLOAT largest = 0;
    for (a = 0; a < channels3or4; a++)
    {
        if (cov[a + a * 4] > largest)
            largest = cov[a + a * 4];
    }

    // normalize the matrix by that element
    if (largest > 0)
    {
        for (a = 0; a < channels3or4; a++)
            for (b = 0; b < channels3or4; b++)
                cov[a + b * 4] = cov[a + b * 4] / largest;
    }

    // covSquared = cov * cov
    for (a = 0; a < channels3or4; a++)
    {
        for (b = 0; b < channels3or4; b++)
        {
            CGV_FLOAT acc = 0;
            for (m = 0; m < channels3or4; m++)
                acc = acc + cov[a + m * 4] * cov[m + b * 4];
            covSquared[a + b * 4] = acc;
        }
    }

    // channel with the largest diagonal element of the squared matrix
    CGU_INT dominant = 0;
    largest          = 0;
    for (a = 0; a < channels3or4; a++)
    {
        if (covSquared[a + a * 4] > largest)
        {
            dominant = a;
            largest  = covSquared[a + a * 4];
        }
    }

    // take that channel's entries as the vector and accumulate its squared length
    CGV_FLOAT length = 0;
    for (a = 0; a < channels3or4; a++)
    {
        length             = length + covSquared[dominant + a * 4] * covSquared[dominant + a * 4];
        EigenVector_out[a] = covSquared[dominant + a * 4];
    }

    // Normalize the Eigen Vector (skipped for a zero vector)
    length = sqrt(length);
    if (length > 0)
    {
        for (a = 0; a < channels3or4; a++)
            EigenVector_out[a] = EigenVector_out[a] / length;
    }
}

// Projects every centered pixel onto EigenVector (dot product over x, y, z and,
// when channels3or4 == 4, w) and stores the scalar in projection_out.
// EigenVector is expected to be normalized by the caller.
void cmp_GetProjecedImage(CMP_INOUT CGV_FLOAT projection_out[16],
                          CMP_IN CGV_Vec4f image_centered[16],
                          CMP_IN CGU_INT numEntries,
                          CMP_IN CGV_Vec4f EigenVector,
                          CMP_IN CGU_UINT8 channels3or4)
{
    for (CGU_INT px = 0; px < numEntries; px++)
    {
        CGV_FLOAT dot = 0.0F;
        dot           = dot + (image_centered[px].x * EigenVector.x);
        dot           = dot + (image_centered[px].y * EigenVector.y);
        dot           = dot + (image_centered[px].z * EigenVector.z);
        if (channels3or4 == 4)
            dot = dot + (image_centered[px].w * EigenVector.w);

        projection_out[px] = dot;
    }
}

// Sort record used by cmp_GetProjectedIndex: a floating point key together with
// the position of the entry it was computed from.
typedef struct
{
    CGV_FLOAT image;  // sort key (rounding residual of a projected entry)
    CGU_UINT8 index;  // position of that entry in the input arrays
} CMP_di2;

// Quantizes projected values to ramp indices.
// Every entry is scaled so the full range of projections spans clusters - 1 steps
// and is rounded to the nearest step. If the rounding residuals are spread widely
// enough, a refinement pass sorts the entries by residual and bumps the index of
// the entries from a computed split point onward by one. Finally all indices are
// rebased so the smallest is 0 and clamped to 0..15.
// All 16 outputs are cleared first; they stay 0 when the projections have no
// spread or the spread is NaN.
void cmp_GetProjectedIndex(CMP_INOUT CGU_UINT8 projected_index_out[16],  //output: index, uncentered, in the range 0..clusters-1
                           CMP_IN CGV_FLOAT image_projected[16],         // image_block points, might be uncentered
                           CMP_IN CGU_INT clusters,                      // clusters: number of points in the ramp   (max 16)
                           CMP_IN CGU_INT numEntries)
{
    CMP_di2   what[16];     // rounding residual + original position of each entry
    CGV_FLOAT image_v[16];  // scaled projections
    CGV_FLOAT image_z[16];  // rounded (integer valued) scaled projections
    CGV_FLOAT image_l;
    CGV_FLOAT image_mm;
    CGV_FLOAT image_r  = 0.0F;  // sum of squared residuals
    CGV_FLOAT image_dm = 0.0F;  // sum of residuals, later their mean
    CGV_FLOAT image_min;
    CGV_FLOAT image_max;
    CGV_FLOAT image_s;

    CGU_INT i;
    CGU_INT j;

    for (i = 0; i < 16; i++)
        projected_index_out[i] = 0;

    image_min = image_projected[0];
    image_max = image_projected[0];

    // NOTE(review): with these comparisons image_min ends up holding the LARGEST
    // projection and image_max the SMALLEST, so img_diff and image_s below are
    // negative (or zero) and the largest projection maps to index 0. Confirm the
    // reversed direction is intended.
    for (i = 1; i < numEntries; i++)
    {
        if (image_min < image_projected[i])
            image_min = image_projected[i];
        if (image_max > image_projected[i])
            image_max = image_projected[i];
    }

    CGV_FLOAT img_diff = image_max - image_min;

    // no spread (or invalid data): leave every index at 0
    if (img_diff == 0.0f)
        return;
    if (cmp_isnan(img_diff))
        return;

    // scale factor mapping the projection range onto clusters - 1 steps
    image_s = (clusters - 1) / img_diff;

    for (i = 0; i < numEntries; i++)
    {
        image_v[i]             = image_projected[i] * image_s;
        // nearest step, measured from image_min
        image_z[i]             = floor(image_v[i] + 0.5F - image_min * image_s);
        projected_index_out[i] = (CGU_UINT8)image_z[i];

        // residual = scaled distance from image_min minus the rounded step
        what[i].image = image_v[i] - image_z[i] - image_min * image_s;
        what[i].index = i;
        image_dm += what[i].image;
        image_r += what[i].image * what[i].image;
    }

    // refine only when numEntries * sum(residual^2) - (sum residual)^2 reaches (numEntries - 1) / 8
    if (numEntries * image_r - image_dm * image_dm >= (CGV_FLOAT)(numEntries - 1) / 8)
    {
        image_dm /= numEntries;

        // center the residuals on their mean
        for (i = 0; i < numEntries; i++)
            what[i].image -= image_dm;

        // sort the records by ascending residual (adjacent swaps)
        CGU_UINT8 tmp_index;
        CGV_FLOAT tmp_image;
        for (i = 1; i < numEntries; i++)
        {
            for (j = i; j > 0; j--)
            {
                if (what[j - 1].image > what[j].image)
                {
                    tmp_index         = what[j].index;
                    tmp_image         = what[j].image;
                    what[j].index     = what[j - 1].index;
                    what[j].image     = what[j - 1].image;
                    what[j - 1].index = tmp_index;
                    what[j - 1].image = tmp_image;
                }
            }
        }

        // got into fundamental simplex
        // move coordinate system origin to its center

        // i=0 < numEntries avoids varying int division by 0
        for (i = 0; i < numEntries; i++)
        {
            what[i].image = what[i].image - (CGV_FLOAT)(((2.0f * i + 1) - numEntries) / (2.0f * numEntries));
        }

        image_mm = 0.0F;
        image_l  = 0.0F;

        // find the position where the running sum of the shifted residuals is lowest
        // (j stays -1 when the running sum never drops below 0)
        j = -1;
        for (i = 0; i < numEntries; i++)
        {
            image_l += what[i].image;
            if (image_l < image_mm)
            {
                image_mm = image_l;
                j        = i;
            }
        }

        j = j + 1;
        // avoid  j = j%numEntries use this
        // (j is at most numEntries here, so this loop never runs; j == numEntries
        // simply makes the loop below a no-op)
        while (j > numEntries)
            j = j - numEntries;

        // every entry from the split point onward (in sorted order) moves up one step
        for (i = j; i < numEntries; i++)
        {
            CGU_UINT8 idx            = what[i].index;
            CGU_UINT8 pidx           = projected_index_out[idx] + 1;  //gather_index(projected_index_out,idx)+1;
            projected_index_out[idx] = pidx;                          // scatter_index(projected_index_out,idx,pidx);
        }
    }

    // get minimum index
    CGU_UINT8 index_min = projected_index_out[0];
    for (i = 1; i < numEntries; i++)
    {
        if (projected_index_out[i] < index_min)
            index_min = projected_index_out[i];
    }

    // reposition all index by min index (using min index as 0)
    for (i = 0; i < numEntries; i++)
    {
        projected_index_out[i] = cmp_clampi(projected_index_out[i] - index_min, 0, 15);
    }
}

// Sum of squared per-channel differences between two pixel sets over the first
// numEntries pixels. Alpha is only included when channels3or4 == 4.
CGV_FLOAT cmp_err_Total(CMP_IN CGV_Vec4ui image_src1[16], CMP_IN CGV_Vec4f image_src2[16], CMP_IN CGU_INT numEntries, CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT total = 0.0F;

    for (CGU_INT px = 0; px < numEntries; px++)
    {
        total += cmp_squaref(image_src1[px].x - image_src2[px].x);
        total += cmp_squaref(image_src1[px].y - image_src2[px].y);
        total += cmp_squaref(image_src1[px].z - image_src2[px].z);
        if (channels3or4 == 4)
            total += cmp_squaref(image_src1[px].w - image_src2[px].w);
    }

    return total;
}


// Assigns a ramp index (numClusters levels) to each of the first numEntries pixels
// and returns the squared error of the linear fit those indices describe.
// Outline:
//   1. center the pixels on their mean and build the channel covariance matrix
//   2. if the total variance is below 1/256, set all 16 indices to 0 and return 0
//   3. project the pixels onto the approximate principal axis and quantize
//   4. refine: rebuild the axis from the index-weighted centered pixels, normalize
//      it, then project and quantize again
//   5. fit a straight line color(index) through the pixels and return the summed
//      squared distance between the source pixels and that line
//   index_out    : OUT, index per pixel
//   image_src    : IN, pixel colors
//   numEntries   : IN, number of pixels
//   numClusters  : IN, number of index levels
//   channels3or4 : IN, 4 includes alpha in all computations, otherwise alpha is ignored
CGV_FLOAT cmp_GetQuantizeIndex_old(CMP_INOUT CGU_UINT8 index_out[16],
                                   CMP_IN CGV_Vec4ui image_src[16],
                                   CMP_IN CGU_INT numEntries,
                                   CMP_IN CGU_INT numClusters,
                                   CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT covariance_vector[16];
    CGV_Vec4f image_centered[16];
    CGV_FLOAT image_projected[16];
    CGV_Vec4f image_mean   = 0.0f;
    CGV_Vec4f eigen_vector = 0.0f;

    // Init vars
    for (CGU_INT ik = 0; ik < 16; ik++)
    {
        covariance_vector[ik] = 0.0f;
        image_centered[ik]    = 0.0f;
        image_projected[ik]   = 0.0f;
    }

    cmp_GetImageCentered(image_centered, image_mean, image_src, numEntries, channels3or4);
    cmp_GetCovarianceVector(covariance_vector, image_centered, numEntries, channels3or4);

    //-----------------------------------------------------
    // check if all covariances are the same
    // if so then set all index to same value 0 and return
    // use EPSILON to set the limit for all same limit
    //-----------------------------------------------------

    CGV_FLOAT image_covt = 0.0F;

    // sum of the diagonal elements (flat positions 0, 5, 10, 15 with stride 4)
    image_covt = covariance_vector[0];
    image_covt = image_covt + covariance_vector[5];
    image_covt = image_covt + covariance_vector[10];
    if (channels3or4 == 4)
        image_covt = image_covt + covariance_vector[15];

    // 0.00390625 = 1/256
    if (image_covt < 0.00390625f)
    {
        for (CGU_INT i = 0; i < 16; i++)
            index_out[i] = 0;
        return 0.0f;
    }

    cmp_GetEigenVector(eigen_vector, covariance_vector, channels3or4);

    // first pass: project onto the axis and quantize
    cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries);

    //==========================================
    // Refine
    //==========================================
    CGV_FLOAT image_q = 0.0F;
    eigen_vector      = 0.0f;

    // new axis = sum over pixels of (centered pixel * its index)
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        eigen_vector.x = eigen_vector.x + image_centered[k].x * index_out[k];
        eigen_vector.y = eigen_vector.y + image_centered[k].y * index_out[k];
        eigen_vector.z = eigen_vector.z + image_centered[k].z * index_out[k];
        if (channels3or4 == 4)
            eigen_vector.w = eigen_vector.w + image_centered[k].w * index_out[k];
    }

    // length of the new axis
    image_q = image_q + eigen_vector.x * eigen_vector.x;
    image_q = image_q + eigen_vector.y * eigen_vector.y;
    image_q = image_q + eigen_vector.z * eigen_vector.z;
    if (channels3or4 == 4)
        image_q = image_q + eigen_vector.w * eigen_vector.w;

    image_q = sqrt(image_q);

    // direction needs to be normalized
    if (image_q != 0.0F)
        eigen_vector = eigen_vector / image_q;

    // Get new projected data
    cmp_GetProjecedImage(image_projected, image_centered, numEntries, eigen_vector, channels3or4);
    cmp_GetProjectedIndex(index_out, image_projected, numClusters, numEntries);

    // Calc Error
    CGV_FLOAT image_t       = 0.0F;  // sum of index^2, then the inverse of sum((index - average)^2)
    CGV_FLOAT index_average = 0.0F;

    for (CGU_INT ik = 0; ik < numEntries; ik++)
    {
        index_average = index_average + index_out[ik];
        image_t       = image_t + index_out[ik] * index_out[ik];
    }

    index_average = index_average / (CGV_FLOAT)numEntries;
    // sum(index^2) - N * average^2 == sum((index - average)^2)
    image_t       = image_t - index_average * index_average * (CGV_FLOAT)numEntries;

    // inverted only when non-zero; a zero spread leaves the slope term at 0 below
    if (image_t != 0.0F)
        image_t = 1.0F / image_t;

    eigen_vector = 0.0f;

    // index-weighted sum of the centered pixels, recomputed for the final indices
    for (CGU_INT nk = 0; nk < numEntries; nk++)
    {
        eigen_vector.x = eigen_vector.x + image_centered[nk].x * index_out[nk];
        eigen_vector.y = eigen_vector.y + image_centered[nk].y * index_out[nk];
        eigen_vector.z = eigen_vector.z + image_centered[nk].z * index_out[nk];
        if (channels3or4 == 4)
            eigen_vector.w = eigen_vector.w + image_centered[nk].w * index_out[nk];
    }

    CGV_Vec4f image_decomp[SOURCE_BLOCK_SIZE];
    for (CGU_UINT32 ii = 0; ii < SOURCE_BLOCK_SIZE; ii++)
        image_decomp[ii] = 0.0f;

    // value of the fitted line at each pixel's index:
    // mean + slope * (index - average), with slope = eigen_vector * image_t
    for (CGU_INT i = 0; i < numEntries; i++)
    {
        image_decomp[i].x = image_mean.x + eigen_vector.x * image_t * (index_out[i] - index_average);
        image_decomp[i].y = image_mean.y + eigen_vector.y * image_t * (index_out[i] - index_average);
        image_decomp[i].z = image_mean.z + eigen_vector.z * image_t * (index_out[i] - index_average);
        if (channels3or4 == 4)
            image_decomp[i].w = image_mean.w + eigen_vector.w * image_t * (index_out[i] - index_average);
    }

    // squared distance between the source pixels and the fitted line
    CGV_FLOAT err_1 = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);

    return err_1;
}

// Sort record used by cmp_sortPartitionProjection: a projection value together
// with the number of the partition it belongs to.
typedef struct
{
    CGV_FLOAT image;  // sort key (projection value of the partition)
    CGU_UINT8 index;  // partition number
} CMP_du2;

// Writes into order[] the positions of projection[] arranged by ascending projection value.
//
// projection     [in]  values to rank; only the first numPartitions entries are read.
// order          [out] order[r] receives the original position of the r-th smallest value.
// numPartitions  [in]  number of valid entries (at most 64).
//
// Entries that compare equal keep their relative input order: a swap only happens on a
// strict "greater than".
void cmp_sortPartitionProjection(CMP_IN CGV_FLOAT projection[64], CMP_INOUT CGU_UINT8 order[64],
                                 CMP_IN CGU_UINT8 numPartitions)  // max 64
{
    CMP_du2   sorted[64];
    CGU_UINT8 pos;
    CGU_UINT8 slot;

    // Tag every value with where it came from
    for (pos = 0; pos < numPartitions; pos++)
    {
        sorted[pos].image = projection[pos];
        sorted[pos].index = pos;
    }

    // Insertion sort: sink the entry at 'pos' towards the front of the already ordered prefix
    for (pos = 1; pos < numPartitions; pos++)
    {
        for (slot = pos; slot > 0; slot--)
        {
            // The prefix below is already ordered, so once the entry stops sinking
            // no further exchange can happen during this pass.
            if (!(sorted[slot - 1].image > sorted[slot].image))
                break;

            CMP_du2 held     = sorted[slot];
            sorted[slot]     = sorted[slot - 1];
            sorted[slot - 1] = held;
        }
    }

    // Only the original positions are reported back
    for (pos = 0; pos < numPartitions; pos++)
        order[pos] = sorted[pos].index;
}




// Computes the ideal (unquantized, unclamped) pair of ramp end points for a given index
// assignment by solving the 2x2 least-squares ("RMS fit") system built from the indices.
//
// image_cluster  [out] the two fitted end points; both are set to 0 when no fit exists.
// index_cluster  [in]  ramp index assigned to each entry; values are used to address a
//                      16-slot table, so they must be below 16.
// Mi_            [in]  highest ramp index (ramp weights are index and Mi_ - index).
// image_src      [in]  source pixels, forwarded to GetClusterMean2.
// numEntries     [in]  number of valid entries in index_cluster / image_src.
// channels3or4   [in]  forwarded to GetClusterMean2.
//
// Returns TRUE when the system could be solved, FALSE when its determinant is 0
// (per the original note: only one active index).
CGU_BOOL cmp_get_ideal_cluster(CMP_INOUT CGV_Vec4f image_cluster[2],
                               CMP_IN CGU_UINT32 index_cluster[16],
                               CMP_IN CGU_INT Mi_,
                               CMP_IN CGV_Vec4ui image_src[16],
                               CMP_IN CGU_INT numEntries,
                               CMP_IN CGU_UINT8 channels3or4)
{
    // get ideal cluster centers
    // The table is addressed by index VALUE (index_cluster[k]) further down, not by entry
    // number, so all 16 slots are cleared: clearing only the first numEntries would leave
    // slots >= numEntries indeterminate when a subset holds fewer than 16 entries.
    CGV_Vec4f image_cluster_mean[16];

    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        image_cluster_mean[ii] = 0.0f;
    }

    GetClusterMean2(image_cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded

    // Symmetric 2x2 normal matrix, stored row-wise and later overwritten with its inverse:
    //   image_matrix0 = { m00, m01 },  image_matrix1 = { m10, m11 }
    CGV_FLOAT image_matrix0[2] = {0, 0};  // matrix /inverse matrix
    CGV_FLOAT image_matrix1[2] = {0, 0};  // matrix /inverse matrix
    CGV_Vec4f image_rp[2];                // right part for RMS fit problem

    image_rp[0] = 0.0f;
    image_rp[1] = 0.0f;

    // weight with cnt if running on compacted index
    for (CGU_INT k = 0; k < numEntries; k++)
    {
        image_matrix0[0] += (Mi_ - index_cluster[k]) * (Mi_ - index_cluster[k]);
        image_matrix0[1] += index_cluster[k] * (Mi_ - index_cluster[k]);  // im is symmetric
        image_matrix1[1] += index_cluster[k] * index_cluster[k];

        image_rp[0] += image_cluster_mean[index_cluster[k]] * (Mi_ - index_cluster[k]);
        image_rp[1] += image_cluster_mean[index_cluster[k]] * index_cluster[k];
    }

    // Determinant of the symmetric matrix: m00 * m11 - m01 * m01
    CGV_FLOAT matrix_dd = image_matrix0[0] * image_matrix1[1] - image_matrix0[1] * image_matrix0[1];

    // assert(matrix_dd !=0);
    // matrix_dd=0 means that index_cidx[k] and (Mi_-index_cidx[k]) collinear which implies only one active index;
    // taken care of separately
    if (matrix_dd == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }

    // In-place inversion of the 2x2 matrix: swap the diagonal (image_matrix1[0] is used as
    // scratch for the old m00), negate the off-diagonal, divide everything by the determinant.
    image_matrix1[0] = image_matrix0[0];
    image_matrix0[0] = image_matrix1[1] / matrix_dd;
    image_matrix1[1] = image_matrix1[0] / matrix_dd;

    image_matrix1[0] = image_matrix0[1] = -image_matrix0[1] / matrix_dd;

    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;

    // End points = inverse matrix * right part, scaled by Mi_.
    // values can exceed 255 here, clamp made no diff in quality!
    image_cluster[0] = (((image_rp[0] * image_matrix0[0]) + (image_rp[1] * image_matrix0[1])) * Mif);
    image_cluster[1] = (((image_rp[0] * image_matrix1[0]) + (image_rp[1] * image_matrix1[1])) * Mif);

    return TRUE;
}

// Quantizes a subset whose indices all collapsed to a single value ("solid colour").
//
// index_out     [out] every entry i < numEntries receives the same index (image_idx).
// epo_code_out  [out] receives the selected end point pair whenever a parity candidate
//                     improves on the running best error; otherwise left as passed in.
// image_src     [in]  source pixels.
// numEntries    [in]  number of valid entries in image_src / index_out.
// Mi_           [in]  only read in the CPU build with USE_NEW_SP_ERR_IDX, to derive clogBC7.
// bits          [in]  per-channel bit counts forwarded to cmp_GetRamp / old_get_sperr.
// type          [in]  non-zero enables parity handling; also selects the row of
//                     cmp_npv_nd / cmp_par_vectors_nd that is walked.
// channels3or4  [in]  number of channels processed (3 or 4).
// blockMode     [in]  index into g_modesettings, used to fetch indexBits.
//
// Returns cmp_err_Total() between image_src and the block rebuilt from epo_code_out.
//
// The body has three build flavours selected by the preprocessor:
//   - ASPM_GPU defined                      : end points are the per-channel min/max of image_src.
//   - CPU with USE_NEW_SP_ERR_IDX           : single-point error search via old_get_sperr().
//   - CPU with USE_BC7_SP_ERR_IDX           : end points looked up in BC7EncodeRamps2.sp_idx.
CGV_FLOAT cmp_quant_solid_color(CMP_INOUT CGU_UINT32 index_out[16],
                                CMP_INOUT CGV_Vec4ui epo_code_out[2],
                                CMP_IN CGV_Vec4ui image_src[16],
                                CMP_IN CGU_INT numEntries,
                                CMP_IN CGU_UINT8 Mi_,
                                CMP_IN CGU_UINT8 bits[4],
                                CMP_IN CGU_INT type,
                                CMP_IN CGU_UINT8 channels3or4,
                                CMP_IN CGU_INT blockMode)
{
#ifndef ASPM_GPU
#if defined(USE_NEW_SP_ERR_IDX) 
    // clogBC7 = floor(log2(Mi_ + 1))
    CGU_INT clogBC7 = 0;
    CGU_INT iv      = Mi_ + 1;
    while (iv >>= 1)
        clogBC7++;

    old_init_BC7ramps(); // first time call inits global
#endif
#endif

    CGU_INT    index_bits = g_modesettings[blockMode].indexBits;
    CGV_Vec4ui epo_0[2];  // best end point pair found for the parity candidate being tried

    epo_0[0] = 0u;
    epo_0[1] = 0u;

    CGU_UINT8 image_log = 0;  // index chosen inside the current parity candidate
    CGU_UINT8 image_idx = 0;  // index finally written to every entry of index_out
    CGU_BOOL  use_par   = FALSE;
    if (type != 0)
        use_par = TRUE;
    CGV_FLOAT error_1 = CMP_FLOAT_MAX;  // best error over all parity candidates
    //CGU_UINT8 ch;
    CGU_UINT8 ch1;
    //CGU_INT   k;
    CGU_INT   i;

    // Walk the parity candidates for this channel count / type; stop early on an exact match.
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type] && (error_1 != 0.0F); pn++)
    {
        // Per channel half-open search ranges [oX[0][ch], oX[1][ch]) for the t1 / t2 loops below.
        // Default is {0, 1}; with parity enabled the range is narrowed to the single value
        // given by the parity vector (bit set -> {1}, bit clear -> {0}).
        CGU_Vec4ui o1[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}};
        CGU_Vec4ui o2[2] = {{0u, 0u, 0u, 0u}, {2u, 2u, 2u, 2u}};

        if (use_par == TRUE)
        {
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][0])
                o1[0][0] = 1;
            else
                o1[1][0] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][0])
                o2[0][0] = 1;
            else
                o2[1][0] = 1;

            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][1])
                o1[0][1] = 1;
            else
                o1[1][1] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][1])
                o2[0][1] = 1;
            else
                o2[1][1] = 1;

            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][2])
                o1[0][2] = 1;
            else
                o1[1][2] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][2])
                o2[0][2] = 1;
            else
                o2[1][2] = 1;

            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][3])
                o1[0][3] = 1;
            else
                o1[1][3] = 1;
            if (cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][3])
                o2[0][3] = 1;
            else
                o2[1][3] = 1;
        }
        CGU_INT   image_tcr[MAX_CHANNELS];  // rounded source value per channel
        CGU_INT   epo_dr_0[MAX_CHANNELS];   // image_tcr clamped to 0..255
        CGV_FLOAT error_0 = CMP_FLOAT_MAX;  // best error for this parity candidate

        // Try every index value representable with index_bits; stop early on an exact match.
        for (CGU_UINT8 iclogBC7 = 0; iclogBC7 < (1 << index_bits) && (error_0 != 0); iclogBC7++)
        {
            CGV_FLOAT error_t = 0;  // error summed over the channels for this index
            CGU_INT   t1o[MAX_CHANNELS], t2o[MAX_CHANNELS];  // t1 / t2 that gave the channel minimum

            for (ch1 = 0; ch1 < channels3or4; ch1++)
            {
                // D
                // NOTE(review): in a CPU build without USE_NEW_SP_ERR_IDX nothing below assigns
                // error_ta, so it stays at CMP_FLOAT_MAX and t1o/t2o/epo_dr_0 are never written.
                CGV_FLOAT error_ta = CMP_FLOAT_MAX;

                for (CGU_UINT8 t1 = o1[0][ch1]; t1 < o1[1][ch1]; t1++)
                {
                    // C
                    // This is needed for non-integer mean points of "collapsed" sets
                    for (CGU_UINT8 t2 = o2[0][ch1]; t2 < o2[1][ch1]; t2++)
                    {
                        // B
                        CGU_INT image_tf;
                        CGU_INT image_tc;
                        image_tf = (CGU_INT)floor(image_src[0][ch1]);
                        image_tc = (CGU_INT)ceil(image_src[0][ch1]);
#ifndef ASPM_GPU
#ifdef USE_NEW_SP_ERR_IDX
                        // Keep whichever of floor / ceil has the lower single-point error;
                        // on a tie fall back to round-to-nearest.
                        CGV_FLOAT err_tf = old_get_sperr(clogBC7, bits[ch1], image_tf, t1, t2, iclogBC7);
                        CGV_FLOAT err_tc = old_get_sperr(clogBC7, bits[ch1], image_tc, t1, t2, iclogBC7);
                        if (err_tf > err_tc)
                            image_tcr[ch1] = image_tc;
                        else if (err_tf < err_tc)
                            image_tcr[ch1] = image_tf;
                        else
                            // NOTE(review): this reads image_src[ch1][COMP_RED] whereas the floor/ceil
                            // above read image_src[0][ch1]; the subscripts look transposed - confirm.
                            image_tcr[ch1] = (CGV_INT)floor(image_src[ch1][COMP_RED] + 0.5F);
                        
                        //===============================
                        // Refine this for better quality!
                        //===============================
                        // error_tr = (e + |d|)^2 expanded, with e the single-point error and
                        // d the rounding difference (same image_src subscript caveat as above).
                        CGV_FLOAT error_tr;
                        error_tr = old_get_sperr(clogBC7, bits[ch1], image_tcr[ch1], t1, t2, iclogBC7);
                        error_tr = (error_tr * error_tr) + 
                                   2 * error_tr * old_img_absf(image_tcr[ch1] - image_src[ch1][COMP_RED]) +
                                   (image_tcr[ch1] - image_src[ch1][COMP_RED]) * (image_tcr[ch1] - image_src[ch1][COMP_RED]);

                        if (error_tr < error_ta)
                        {
                            error_ta      = error_tr;
                            t1o[ch1]      = t1;
                            t2o[ch1]      = t2;
                            epo_dr_0[ch1] = cmp_clampi(image_tcr[ch1], 0, 255);
                        }
#endif  // USE_NEW_SP_ERR_IDX
#else   // ASPM_GPU: no error search, round the source value and report zero error
                        image_tcr[ch1] = (CGU_INT)floor(image_src[0][ch1] + 0.5F);
                        error_ta       = 0;
                        t1o[ch1]       = t1;
                        t2o[ch1]       = t2;
                        epo_dr_0[ch1]  = cmp_clampi(image_tcr[ch1], 0, 255);
#endif  // ASPM_GPU
                      
                    }  // B
                }      //C

                error_t += error_ta;
            }  // D

            if (error_t <= error_0)
            {
                // We have a solid color: Use image src if on GPU
                image_log = iclogBC7;
                image_idx = image_log;

#ifndef ASPM_GPU
#ifdef USE_BC7_SP_ERR_IDX
               // NOTE(review): clogBC7 is declared under USE_NEW_SP_ERR_IDX above but used here
               // under USE_BC7_SP_ERR_IDX - presumably both macros are defined together; confirm.
               if (BC7EncodeRamps2.ramp_init) {
                   for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
                   {
                       // Flattened lookup offset into sp_idx, built from clogBC7, bits, the clamped
                       // colour, t1, t2 and the index; two consecutive entries hold the end point pair.
                       CGV_INT index = (CLT2(clogBC7) * 4 * 256 * 2 * 2 * 16 * 2) + 
                                       (BTT2(bits[ch]) * 256 * 2 * 2 * 16 * 2) + 
                                       (epo_dr_0[ch] * 2 * 2 * 16 * 2) +
                                       (t1o[ch] * 2 * 16 * 2) + 
                                       (t2o[ch] * 16 * 2) + 
                                       (iclogBC7 * 2);
                       epo_0[0][ch] = BC7EncodeRamps2.sp_idx[index + 0] & 0xFF;
                       epo_0[1][ch] = BC7EncodeRamps2.sp_idx[index + 1] & 0xFF;
                   }
               }
#endif  // USE_BC7_SP_ERR_IDX
#else   // ASPM_GPU: use the per-channel min / max of the source entries as end points
                CGU_UINT8 ch;
                CGU_UINT8 k;
                // This needs improving
                CGV_FLOAT MinC[4] = {255, 255, 255, 255};
                CGV_FLOAT MaxC[4] = {0, 0, 0, 0};
                // get min max colors
                for (ch = 0; ch < channels3or4; ch++)
                    for (k = 0; k < numEntries; k++)
                    {
                        if (image_src[k][ch] < MinC[ch])
                            MinC[ch] = image_src[k][ch];
                        if (image_src[k][ch] > MaxC[ch])
                            MaxC[ch] = image_src[k][ch];
                    }
                // All four channels are stored; with channels3or4 == 3 channel 3 keeps the
                // initial values above (min 255, max 0).
                epo_0[0][0] = (CGU_UINT8)MinC[0];
                epo_0[1][0] = (CGU_UINT8)MaxC[0];
                epo_0[0][1] = (CGU_UINT8)MinC[1];
                epo_0[1][1] = (CGU_UINT8)MaxC[1];
                epo_0[0][2] = (CGU_UINT8)MinC[2];
                epo_0[1][2] = (CGU_UINT8)MaxC[2];
                epo_0[0][3] = (CGU_UINT8)MinC[3];
                epo_0[1][3] = (CGU_UINT8)MaxC[3];
#endif  // ASPM_GPU

                error_0     = error_t;
            }

        }  // E

        // Keep this parity candidate when it beats the best seen so far
        if (error_0 < error_1)
        {
            image_idx       = image_log;
            epo_code_out[0] = epo_0[0];
            epo_code_out[1] = epo_0[1];
            error_1         = error_0;
        }

    }  //1

    // Get Image error
    CGV_Vec4f image_decomp[16];
    for (i = 0; i < numEntries; i++)
    {
        index_out[i] = image_idx;
        {
            // NOTE(review): the ramp position passed to cmp_GetRamp is the entry counter i,
            // not image_idx which was just stored in index_out[i] - confirm this is intended.
            image_decomp[i][0] = cmp_GetRamp(index_bits, bits[0], epo_code_out[0].x, epo_code_out[1].x, i);
            image_decomp[i][1] = cmp_GetRamp(index_bits, bits[1], epo_code_out[0].y, epo_code_out[1].y, i);
            image_decomp[i][2] = cmp_GetRamp(index_bits, bits[2], epo_code_out[0].z, epo_code_out[1].z, i);
            if (channels3or4 == 4)
                image_decomp[i][3] = cmp_GetRamp(index_bits, bits[3], epo_code_out[0].w, epo_code_out[1].w, i);
        }
    }
    // Do we need to do this rather then err_1 * numEntries
    CGV_FLOAT error_quant;
    error_quant = cmp_err_Total(image_src, image_decomp, numEntries, channels3or4);

    return error_quant;
}

// Returns v multiplied by itself.
INLINE CGV_FLOAT old_sq_image(CGV_FLOAT v)
{
    CGV_FLOAT squared = v;
    squared *= v;
    return squared;
}


// "Shakes" the ideal floating point end points: for every channel and every parity
// combination it scans a small window of quantized end point codes around the ideal value
// and keeps the pair with the lowest squared error, then selects the parity vector whose
// summed per-channel error is smallest.
//
// epo_code_shake  [out] receives the winning end point codes (all four components are
//                       written whenever a parity vector improves on the best error).
// image_cluster   [in]  ideal (unquantized) end points, one per ramp end.
// index_cidx      [in]  ramp index of every entry.
// image_src       [in]  source pixels.
// index_bits      [in]  forwarded to cmp_GetRamp.
// type            [in]  selects the row of cmp_npv_nd / cmp_par_vectors_nd.
// max_bits        [in]  end point bits per channel.
// use_par         [in]  0 or 1; with 1 the codes are stepped by 2 and the window is not widened.
// numEntries      [in]  number of valid entries (max 16).
// channels3or4    [in]  number of channels searched (3 or 4).
//
// Returns the summed squared error of the selected parity vector, or CMP_FLOAT_MAX when
// no parity vector was evaluated.
CGV_FLOAT cmp_shake3(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                     CMP_IN CGV_Vec4f image_cluster[2],
                     CMP_IN CGU_UINT32 index_cidx[16],
                     CMP_IN CGV_Vec4ui image_src[16],
                     CMP_IN CGU_INT index_bits,
                     CMP_IN CGU_INT type,
                     CMP_IN CGU_UINT8 max_bits[4],
                     CMP_IN CGU_UINT8 use_par,
                     CMP_IN CGU_INT numEntries,  // max 16
                     CMP_IN CGU_UINT8 channels3or4)
{
    CGV_FLOAT best_err = CMP_FLOAT_MAX;

    // err_ed is addressed as [parityA * 8 + parityB * 4 + channel]
    CGV_FLOAT err_ed[16] = {0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f};

    // Best codes per [parityA][parityB][end point][channel].
    // Cleared up front: the search below only fills channels < channels3or4 (and only the
    // parity slots it visits), while the selection stage reads all four channels, so
    // without this the unused slots would be read uninitialized.
    CGU_INT epo_code_par[2][2][2][MAX_CHANNELS];

    for (CGU_INT initA = 0; initA < 2; initA++)
        for (CGU_INT initB = 0; initB < 2; initB++)
            for (CGU_INT initE = 0; initE < 2; initE++)
                for (CGU_INT initC = 0; initC < MAX_CHANNELS; initC++)
                    epo_code_par[initA][initB][initE][initC] = 0;

    for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT8 ppA = 0;
        CGU_UINT8 ppB = 0;
        CGU_UINT8 rr  = (use_par ? 2 : 1);  // parity values to try per end point
        CGU_INT   epo_code_epi[2][2];       // [end point][0 = range begin, 1 = range end]

        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max =2
            for (ppB = 0; ppB < rr; ppB++)
            {  //loop  max =2

                // set default ranges
                epo_code_epi[0][0] = epo_code_epi[0][1] = cmp_ep_find_floor2(image_cluster[0][ch], max_bits[ch], use_par, ppA);
                epo_code_epi[1][0] = epo_code_epi[1][1] = cmp_ep_find_floor2(image_cluster[1][ch], max_bits[ch], use_par, ppB);

                // "& (~use_par)" keeps the adjustment as-is when use_par is 0 and clears its
                // lowest bit when use_par is 1.

                // set begin range: one code lower, without going below 0
                epo_code_epi[0][0] -= ((epo_code_epi[0][0] < 1 ? epo_code_epi[0][0] : 1)) & (~use_par);
                epo_code_epi[1][0] -= ((epo_code_epi[1][0] < 1 ? epo_code_epi[1][0] : 1)) & (~use_par);

                // set end range: up to two codes higher, without exceeding the largest code
                epo_code_epi[0][1] += ((1 << max_bits[ch]) - 1 - epo_code_epi[0][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[0][1] : 2) & (~use_par);
                epo_code_epi[1][1] += ((1 << max_bits[ch]) - 1 - epo_code_epi[1][1] < 2 ? (1 << max_bits[ch]) - 1 - epo_code_epi[1][1] : 2) & (~use_par);

                CGU_INT step                       = (1 << use_par);  // 1, or 2 when use_par is 1
                err_ed[(ppA * 8) + (ppB * 4) + ch] = CMP_FLOAT_MAX;

                for (CGU_INT epo_p1 = epo_code_epi[0][0]; epo_p1 <= epo_code_epi[0][1]; epo_p1 += step)
                {
                    for (CGU_INT epo_p2 = epo_code_epi[1][0]; epo_p2 <= epo_code_epi[1][1]; epo_p2 += step)
                    {
                        // Squared error of this channel over all entries for the pair (epo_p1, epo_p2)
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGU_INT   _mc               = numEntries;
                        CGV_FLOAT image_ramp;

                        while (_mc > 0)
                        {
                            image_ramp = cmp_GetRamp(index_bits, max_bits[ch], epo_p1, epo_p2, index_cidx[_mc - 1]);

                            image_square_diff += cmp_squaref(image_ramp - image_src[(_mc - 1)][ch]);
                            _mc--;
                        }
                        if (image_square_diff < err_ed[(ppA * 8) + (ppB * 4) + ch])
                        {
                            err_ed[(ppA * 8) + (ppB * 4) + ch] = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch]      = epo_p1;
                            epo_code_par[ppA][ppB][1][ch]      = epo_p2;
                        }
                    }
                }
            }  // pp1
        }      // pp0
    }          // j

    //---------------------------------------------------------
    // Select the parity vector with the smallest error summed over the active channels
    //---------------------------------------------------------
    for (CGU_INT pn = 0; pn < cmp_npv_nd[channels3or4 - 3][type]; pn++)
    {
        CGV_FLOAT err_2 = 0.0F;
        CGU_INT   d1;
        CGU_INT   d2;

        for (CGU_UINT8 ch = 0; ch < channels3or4; ch++)
        {
            d1 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][ch];
            d2 = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][ch];
            err_2 += err_ed[(d1 * 8) + (d2 * 4) + ch];
        }

        if (err_2 < best_err)
        {
            best_err = err_2;

            // All four components are written, as before; for a 3 channel search the
            // fourth component now comes from the cleared table (0).
            for (CGU_UINT8 outCh = 0; outCh < 4; outCh++)
            {
                d1                       = cmp_par_vectors_nd[channels3or4 - 3][type][pn][0][outCh];
                d2                       = cmp_par_vectors_nd[channels3or4 - 3][type][pn][1][outCh];
                epo_code_shake[0][outCh] = epo_code_par[d1][d2][0][outCh];
                epo_code_shake[1][outCh] = epo_code_par[d1][d2][1][outCh];
            }
        }
    }

    return best_err;
}

                     CGV_FLOAT cmp_requantized_index(CMP_INOUT CGU_UINT8 index_out[16],
                                CMP_INOUT CGU_Vec4ui epo_code_best[2],
                                CMP_IN CGU_INT index_bits,
                                CMP_IN CGU_UINT8 max_bits[4],
                                CMP_IN CGV_Vec4ui image_src[16],
                                CMP_IN CGU_INT numEntries,
                                CMP_IN CGU_UINT8 channels3or4)
{
    // Re-assigns every source entry to the closest colour of the ramp spanned by
    // epo_code_best and reports the resulting total squared error.
    //
    // index_out      [out] best ramp position for each of the first numEntries entries.
    // epo_code_best  [in]  end point codes the ramp is built from (not modified here).
    // index_bits     [in]  the first (1 << index_bits) ramp positions are candidates.
    // max_bits       [in]  end point bits per channel, forwarded to cmp_GetRamp.
    // image_src      [in]  source pixels.
    // numEntries     [in]  number of valid entries in image_src.
    // channels3or4   [in]  number of channels that take part in the error.
    //
    // Returns the sum over all entries of the smallest squared distance found.

    //=========================================
    // Ramp colours for the given end points
    //=========================================
    CGV_FLOAT ramp[SOURCE_BLOCK_SIZE][MAX_CHANNELS];

    for (CGU_UINT8 comp = 0; comp < channels3or4; comp++)
    {
        for (CGU_UINT8 slot = 0; slot < SOURCE_BLOCK_SIZE; slot++)
        {
            ramp[slot][comp] = cmp_GetRamp(index_bits, max_bits[comp], epo_code_best[0][comp], epo_code_best[1][comp], slot);
        }
    }

    //=========================================
    // Nearest ramp colour for every entry
    //=========================================
    CGV_INT   rampCount = (1 << index_bits);
    CGV_FLOAT totalErr  = 0.0F;

    for (CGU_UINT8 entry = 0; entry < numEntries; entry++)
    {
        CGV_FLOAT nearestErr  = CMP_FLOAT_MAX;
        CGV_INT   nearestSlot = 0;

        for (CGV_INT cand = 0; cand < rampCount; cand++)
        {
            CGV_FLOAT candErr = 0.0F;

            for (CGU_UINT8 c = 0; c < channels3or4; c++)
            {
                CGV_FLOAT delta = ramp[cand][c] - image_src[entry][c];
                candErr += delta * delta;
            }

            // Strict comparison: on a tie the lowest ramp position wins
            if (candErr < nearestErr)
            {
                nearestErr  = candErr;
                nearestSlot = cand;
            }
        }

        index_out[entry] = (CGV_UINT8)nearestSlot;
        totalErr += nearestErr;
    }

    return totalErr;
}


// Iteratively refines the end points and indices of one subset.
//
// Each pass tries every slope/offset remapping of the collapsed indices that fits the ramp,
// fits ideal end points for it (cmp_get_ideal_cluster), quantizes them (cmp_shake3) and
// finally re-derives the indices for the best end points (cmp_requantized_index).
//
// epo_code_out      [out] cleared, then set to the best end point codes found.
// index_io          [io]  starting indices on entry; best indices on exit when improved.
// index_packed_out  [out] packed form of index_io, refreshed on every improvement.
// image_src         [in]  source pixels of the subset.
// numEntries        [in]  number of valid entries.
// Mi_               [in]  number of ramp positions; the highest position is Mi_ - 1.
// bits              [in]  total end point bits; split evenly (rounded up) over
//                         2 * channels3or4 components, a non-zero remainder enables parity.
// channels3or4      [in]  3 or 4 channels.
// errorThreshold    [in]  stop as soon as the best error is at or below this value.
// blockMode         [in]  index into g_modesettings (indexBits).
//
// Returns the best error, the result of cmp_quant_solid_color for a solid block, or
// CMP_FLOAT_MAX when the mode has more than 5 index bits or nothing could be fitted.
CGV_FLOAT cmp_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],
                                         CMP_INOUT CGU_UINT32 index_io[16],
                                         CMP_INOUT CGU_UINT32 index_packed_out[2],
                                         CMP_IN CGV_Vec4ui image_src[16],
                                         CMP_IN CGU_INT numEntries,
                                         CMP_IN CGU_UINT8 Mi_,
                                         CMP_IN CGU_UINT8 bits,
                                         CMP_IN CGU_UINT8 channels3or4,
                                         CMP_IN CGU_FLOAT errorThreshold,
                                         CMP_IN CGU_INT blockMode)
{
    CGU_INT   entry;
    CGU_UINT8 comp;

    // Bit budget: 'bits' is shared by both end points of every channel
    CGU_UINT8 componentCount = 2 * channels3or4;
    CGU_INT   type           = bits % componentCount;
    CGU_UINT8 use_par        = (type != 0);
    CGU_UINT8 bitsPerChannel = (bits + componentCount - 1) / componentCount;

    CGU_UINT8 max_bits[4] = {0, 0, 0, 0};
    for (comp = 0; comp < channels3or4; comp++)
        max_bits[comp] = bitsPerChannel;

    // Same test as (index_bits - 2) > 3
    CGU_INT index_bits = g_modesettings[blockMode].indexBits;
    if (index_bits > 5)
        return CMP_FLOAT_MAX;

    // From here on Mi_ is the highest ramp position rather than the position count
    Mi_ -= 1;

    CGU_UINT32 workIndex[16];
    for (entry = 0; entry < numEntries; entry++)
        workIndex[entry] = cmp_clampui8(index_io[entry], 0, 15);

    epo_code_out[0] = 0u;
    epo_code_out[1] = 0u;

    CGV_FLOAT bestError = CMP_FLOAT_MAX;
    CGU_UINT8 maxIndex  = index_collapse2(workIndex, numEntries);

    //===============================
    // we have a solid color 4x4 block
    //===============================
    if (maxIndex == 0)
    {
        return cmp_quant_solid_color(index_io, epo_code_out, image_src, numEntries, Mi_, max_bits, type, channels3or4, blockMode);
    }

    for (CGU_INT attempt = 0; attempt < MAX_TRY_SHAKER; attempt++)
    {
        //===============================
        // We have ramp colors to process
        //===============================
        CGV_FLOAT  clusterError     = CMP_FLOAT_MAX;
        CGU_Vec4ui epo_code_best[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
        CGU_UINT32 trialIndex[16];

        for (CGU_UINT8 clr = 0; clr < numEntries; clr++)
            trialIndex[clr] = 0;

        // Stretch (slope) and shift (offset) the collapsed indices over every placement
        // that still fits inside 0..Mi_
        for (CGU_UINT8 slope = 1; (slope * maxIndex) <= Mi_; slope++)
        {
            CGV_Vec4f image_cluster[2] = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};

            for (CGU_UINT8 offset = 0; offset <= (Mi_ - slope * maxIndex); offset++)
            {
                //-------------------------------------
                // set a new index data to try
                //-------------------------------------
                for (entry = 0; entry < numEntries; entry++)
                    trialIndex[entry] = workIndex[entry] * slope + offset;

                if (cmp_get_ideal_cluster(image_cluster, trialIndex, Mi_, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui epo_code_shake[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

                    CGV_FLOAT shakeError = cmp_shake3(epo_code_shake,
                                                      image_cluster,
                                                      trialIndex,
                                                      image_src,
                                                      index_bits,
                                                      type,
                                                      max_bits,
                                                      use_par,
                                                      numEntries,
                                                      channels3or4);

                    if (shakeError < clusterError)
                    {
                        clusterError     = shakeError;
                        epo_code_best[0] = epo_code_shake[0];
                        epo_code_best[1] = epo_code_shake[1];
                    }
                }
            }
        }

        if (clusterError != CMP_FLOAT_MAX)
        {
            //=========================
            // test results for quality
            //=========================
            CGU_UINT8 index_best[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};

            CGV_FLOAT requantError = cmp_requantized_index(index_best,
                                                           epo_code_best,
                                                           index_bits,
                                                           max_bits,
                                                           image_src,
                                                           numEntries,
                                                           channels3or4);
            if (requantError < bestError)
            {
                // Adopt the new indices both as the result and as the next starting point
                for (entry = 0; entry < numEntries; entry++)
                {
                    workIndex[entry] = index_best[entry];
                    index_io[entry]  = workIndex[entry];
                }

                cmp_pack4bitindex32(index_packed_out, index_io);
                epo_code_out[0] = epo_code_best[0];
                epo_code_out[1] = epo_code_best[1];
                bestError       = requantError;
            }
        }

        // Early out if we have our target err
        if (bestError <= errorThreshold)
            break;

        // Collapse again; nothing left to vary once the indices became solid
        maxIndex = index_collapse2(workIndex, numEntries);
        if (maxIndex == 0)
            break;
    }

    return bestError;
}

// ORs bitVal into the byte stream 'base' starting at bit position 'offset'.
// When offset and bits together cross a byte boundary, the part of bitVal that does not
// fit is ORed into the following byte. bitVal is not masked to 'bits' bits.
// Returns the bit position just past the written field (offset + bits).
CGU_UINT8 cmp_Write8Bit2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal)
{
    CGU_INT byteIndex = offset / 8;
    CGU_INT bitIndex  = offset % 8;

    base[byteIndex] |= bitVal << bitIndex;

    // Spill the high part into the next byte
    if (bitIndex + bits > 8)
    {
        base[byteIndex + 1] |= shift_right_uint82(bitVal, 8 - bitIndex);
    }

    offset += bits;
    return offset;
}

// Returns v shifted right by 'bits' bit positions.
INLINE CGU_UINT8 shift_right_uint8V2(CMP_IN CGU_UINT8 v, CMP_IN CGU_UINT8 bits)
{
    CGU_UINT8 shifted = v >> bits;  // (perf warning expected)
    return shifted;
}

// ORs bitVal into the byte stream 'base' starting at bit position 'offset'.
// When offset and bits together cross a byte boundary, the part of bitVal that does not
// fit is ORed into the following byte. Unlike cmp_Write8Bit2 nothing is returned: the
// caller advances the bit position itself.
void cmp_Write8BitV2(CMP_INOUT CGU_UINT8 base[16], CMP_IN CGU_INT offset, CMP_IN CGU_INT bits, CMP_IN CGU_UINT8 bitVal)
{
    CGU_INT byteIndex = offset / 8;
    CGU_INT bitIndex  = offset % 8;

    base[byteIndex] |= bitVal << bitIndex;

    // Spill the high part into the next byte
    if (bitIndex + bits > 8)
    {
        base[byteIndex + 1] |= shift_right_uint8V2(bitVal, 8 - bitIndex);
    }
}


// Serialises one block for modes 0, 1, 2, 3 and 7 into cmp_out.
//
// blockMode        [in]  mode number; selects the g_modesettings entry.
// bestPartition    [in]  partition number written after the mode bit.
// packedEndpoints  [in]  two packed end points per subset (element 2*s and 2*s+1); the pair
//                        of a subset is swapped in place when its indices must be flipped.
// index16          [in]  one index per pixel.
// cmp_out          [out] cleared, then filled with: mode bit, partition bits, end point
//                        components, parity bits (not for mode 2) and the indices.
void cmp_Encode_mode01237(CMP_IN CGU_INT blockMode,
                          CMP_IN CGU_UINT8 bestPartition,
                          CMP_IN CGU_UINT32 packedEndpoints[6],
                          CMP_IN CGU_UINT8 index16[16],
                          CMP_INOUT CGU_UINT8 cmp_out[16])
{
    // Mode settings used throughout
    CGU_UINT32 indexBits     = g_modesettings[blockMode].indexBits;
    CGU_UINT32 subsetCount   = g_modesettings[blockMode].maxSubSets;
    CGU_UINT32 componentBits = g_modesettings[blockMode].componentBits;
    CGU_UINT32 channelCount  = g_modesettings[blockMode].channels3or4;

    CGU_UINT8  blockindex[SOURCE_BLOCK_SIZE];
    CGU_UINT32 n;     // pixel or subset counter
    CGU_UINT32 comp;  // colour component counter

    for (n = 0; n < COMPRESSED_BLOCK_SIZE; n++)
        cmp_out[n] = 0;

    // mode 0 = 1, mode 1 = 01, mode 2 = 001, mode 3 = 0001, ...
    // (the output is zeroed, so writing a single 1 at bit 'blockMode' yields that pattern)
    CGU_INT bitPosition = blockMode;
    bitPosition         = cmp_Write8Bit2(cmp_out, bitPosition, 1, 1);

    // Write partition bits
    bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, g_modesettings[blockMode].partitionBits, bestPartition);

    // The implicitly encoded MSB of every fixup index must be 0. Find the subsets whose
    // fixup index has the MSB set: their end points are swapped and their indices inverted.
    CGU_UINT32 fixup[3] = {0, 0, 0};
    cmp_get_fixuptable(fixup, (subsetCount == 2 ? bestPartition : bestPartition + 64));

    CGU_INT flipColours[3] = {0, 0, 0};

    for (n = 0; n < SOURCE_BLOCK_SIZE; n++)
    {
        blockindex[n] = index16[n];

        for (CGU_UINT8 s = 0; s < subsetCount; s++)
        {
            if ((n == fixup[s]) && ((blockindex[n] & (1 << (indexBits - 1))) != 0))
                flipColours[s] = 1;
        }
    }

    // Swap the end point pair of every flagged subset ...
    for (n = 0; n < subsetCount; n++)
    {
        if (flipColours[n] != 1)
            continue;

        CGU_UINT32 first           = packedEndpoints[n * 2 + 0];
        packedEndpoints[n * 2 + 0] = packedEndpoints[n * 2 + 1];
        packedEndpoints[n * 2 + 1] = first;
    }

    // ... and mirror the indices of the pixels that belong to it
    CGU_INT indexMax = (1 << indexBits) - 1;

    for (n = 0; n < SOURCE_BLOCK_SIZE; n++)
    {
        CGU_UINT8 owner = cmp_get_partition_subset2(bestPartition, subsetCount, n);

        if (flipColours[owner] == 1)
            blockindex[n] = indexMax - blockindex[n];
    }

    // Endpoints are stored in the following order RRRR GGGG BBBB (AAAA) (PPPP)
    // i.e. components are packed together
    CGU_Vec4ui unpackedColours[MAX_SUBSETS * 2];
    CGU_UINT8  parityBits[MAX_SUBSETS][2];

    for (n = 0; n < MAX_SUBSETS * 2; n++)
        unpackedColours[n] = 0;

    // Split every packed end point into its parity bit and colour components
    CGU_UINT32 componentMask = (1 << componentBits) - 1;

    for (n = 0; n < subsetCount; n++)
    {
        CGU_UINT32 packedColours[2] = {packedEndpoints[n * 2 + 0], packedEndpoints[n * 2 + 1]};

        if (blockMode == 0 || blockMode == 3 || blockMode == 7)
        {  // TWO_PBIT: each end point carries its own parity bit in bit 0
            parityBits[n][0] = packedColours[0] & 1;
            parityBits[n][1] = packedColours[1] & 1;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 1)
        {  // ONE_PBIT: both slots take bit 0 of the second end point
            CGU_UINT8 sharedParity = packedColours[1] & 1;
            parityBits[n][0]       = sharedParity;
            parityBits[n][1]       = sharedParity;
            packedColours[0] >>= 1;
            packedColours[1] >>= 1;
        }
        else if (blockMode == 2)
        {  // no parity bits
            parityBits[n][0] = 0;
            parityBits[n][1] = 0;
        }

        for (comp = 0; comp < channelCount; comp++)
        {
            unpackedColours[n * 2][comp]     = packedColours[0] & componentMask;
            unpackedColours[n * 2 + 1][comp] = packedColours[1] & componentMask;
            packedColours[0] >>= componentBits;
            packedColours[1] >>= componentBits;
        }
    }

    // Colour components: component by component, and per component subset by subset
    for (comp = 0; comp < channelCount; comp++)
    {
        for (n = 0; n < subsetCount; n++)
        {
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, componentBits, unpackedColours[n * 2][comp] & 0xFF);
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, componentBits, unpackedColours[n * 2 + 1][comp] & 0xFF);
        }
    }

    // Parity bits: one per subset in mode 1, two per subset otherwise, none in mode 2
    if (blockMode != 2)
    {
        for (n = 0; n < subsetCount; n++)
        {
            bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[n][0] & 0x01);

            if (blockMode != 1)
                bitPosition = cmp_Write8Bit2(cmp_out, bitPosition, 1, parityBits[n][1] & 0x01);
        }
    }

    // Encode the index bits
    for (n = 0; n < 16; n++)
    {
        CGU_UINT8 owner = cmp_get_partition_subset2(bestPartition, subsetCount, n);

        CGU_INT   fieldBits  = indexBits;
        CGU_UINT8 fieldValue = blockindex[n];

        // If this is a fixup index then drop the MSB which is implicitly 0
        if (n == fixup[owner])
        {
            fieldBits  = indexBits - 1;
            fieldValue = blockindex[n] & 0x07F;
        }

        cmp_Write8BitV2(cmp_out, bitPosition, fieldBits, fieldValue);
        bitPosition += fieldBits;
    }
}


// Encodes one block of 16 source pixels and stores the packed result in
// best_cmp_out[0..3] (element [4] is never written by this function).
//
// Two implementations are selected at compile time:
//
//   USE_OLDCODE defined
//     1. For each of the 64 partitions, split the block into subsets and record
//        the quantization error plus the per-subset indices.
//     2. Sort the partitions by that error and run the endpoint optimizer on the
//        best few (numShakeAttempts), remembering the lowest-error result.
//     3. Pack the winning endpoints, re-order the indices into pixel order and
//        call cmp_Encode_mode01237 to build the 16 byte block.
//     Returns the error of the encoded block, or the initial 1e30f sentinel
//     (without writing best_cmp_out) if nothing beat it.
//
//   USE_OLDCODE not defined
//     Picks the partition with the lowest cmp_GetPartitionError, runs
//     ProcessBlock for the 16 pixels and packs the result with block_package1.
//     NOTE(review): this path ignores block_mode - the mode passed to
//     ProcessBlock and the local input_mode are both hard-coded to 1.
//     Always returns 0.0f.
//
// Parameters
//   best_cmp_out : receives the compressed block as four 32 bit words
//   image_src    : the 16 source pixels
//   block_mode   : index into g_modesettings (USE_OLDCODE path only)
CGV_FLOAT cmp_process_mode(CMP_INOUT CGU_UINT32 best_cmp_out[5], CMP_IN CGU_Vec4ui image_src[16], CMP_IN CGU_INT block_mode)
{
#ifdef USE_OLDCODE
    CGV_FLOAT  best_err = 1e30f;
    CGU_Vec4ui epo_code[6];
    CGU_Vec4ui bestEndpoints[6];
    CGU_UINT8  bestindex[3][16];
    CGU_INT    bestEntryCount[3];
    CGU_UINT8  bestindex16[16];
    CGU_UINT32 packedEndpoints[6] = {0, 0, 0, 0, 0, 0};

    CGU_UINT32 k;
    CGU_UINT32 ch;
    CGU_UINT32 subset;

    // Check for a solid color for a fast encode
    CGV_Vec4ui mean_out = 0.0f;

    for (k = 0; k < 16; k++)
    {
        mean_out       = mean_out + image_src[k];
        bestindex16[k] = 0;
    }

    mean_out = mean_out / 16;

    // Image has alpha
    // (placeholder: no special handling is implemented for this case)
    if (mean_out.w < 255)
    {
    }

    CGU_UINT8 storedBestindex[64][3][16];
    CGV_FLOAT storedError[64];
    CGU_UINT8 sortedPartition[64];

    CGV_FLOAT   quality    = 1.0f;
    CGV_FLOAT   opaque_err = 0.0f;
    CGV_Vec4ui  image_subsets[3][16];
    CGU_INT     subset_entryCount[MAX_SUBSETS] = {0, 0, 0};
    CGU_UINT8   bestPartition = 0;

    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, mode_blockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);

        CGV_Vec4ui subset_image_src[16];
        CGU_UINT8  index_out1[16];
        CGV_FLOAT  err_quant = 0.0F;

        // Store the quantize error for this partition to be sorted and processed later
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT numEntries = subset_entryCount[subset];

            for (CGU_UINT8 ii = 0; ii < 16; ii++)
                subset_image_src[ii] = image_subsets[subset][ii];

            err_quant += cmp_GetQuantizeIndex_old(
                index_out1, subset_image_src, numEntries, g_modesettings[block_mode].clusters, g_modesettings[block_mode].channels3or4);

            // Only the first numEntries indices of each subset are recorded
            for (CGU_UINT8 idx = 0; idx < numEntries; idx++)
                storedBestindex[mode_blockPartition][subset][idx] = index_out1[idx];
        }

        storedError[mode_blockPartition] = err_quant;
    }

    // Sort the results
    cmp_sortPartitionProjection(storedError, sortedPartition, 64);  // 64 partitions

    // With quality fixed at 1.0 this evaluates to 8 attempts
    CGU_UINT8 numShakeAttempts = cmp_max8(1, cmp_min8((CGU_UINT8)floor(8 * quality + 0.5), 64));  // 64 partitions
    CGV_FLOAT err_best         = CMP_FLOAT_MAX;

    // Now do the endpoint shaking
    for (CGU_UINT8 nSA = 0; nSA < numShakeAttempts; nSA++)
    {
        CGV_FLOAT err_optimized = 0.0F;
        CGU_UINT8 sortedBlockPartition;
        sortedBlockPartition = sortedPartition[nSA];

        //********************************************
        // Get the partition shape for the given mode
        //********************************************
        cmp_GetPartitionSubSet2_mode01237(
            image_subsets, subset_entryCount, sortedBlockPartition, image_src, block_mode, g_modesettings[block_mode].channels3or4);

        //*****************************
        // Process the partition shape
        //*****************************
        for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
        {
            CGU_INT    numEntries = subset_entryCount[subset];
            CGU_UINT32 index_io[16];
            CGV_Vec4ui src_image_block[16];
            CGU_Vec4ui tmp_epo_code[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

            for (k = 0; k < 16; k++)
                src_image_block[k] = image_subsets[subset][k];

            for (k = 0; k < 16; k++)
                index_io[k] = storedBestindex[sortedBlockPartition][subset][k];

            CGU_UINT32 index_packed_out[2] = {0, 0};

            err_optimized += cmp_optimize_IndexAndEndPoints(tmp_epo_code,
                                                            index_io,
                                                            index_packed_out,
                                                            src_image_block,
                                                            numEntries,
                                                            g_modesettings[block_mode].clusters,
                                                            g_modesettings[block_mode].bits,
                                                            g_modesettings[block_mode].channels3or4,
                                                            0.01f,
                                                            1);

            // Keep the refined indices so they can be copied out if this attempt wins
            for (k = 0; k < 16; k++)
                storedBestindex[sortedBlockPartition][subset][k] = index_io[k];

            epo_code[subset * 2]     = tmp_epo_code[0];
            epo_code[subset * 2 + 1] = tmp_epo_code[1];

            shared_temp[subset * 2].endPoint_low      = tmp_epo_code[0];
            shared_temp[subset * 2 + 1].endPoint_high = tmp_epo_code[1];
        }

        //****************************************
        // Check if result is better than the last
        //****************************************
        if (err_optimized < err_best)
        {
            bestPartition          = sortedBlockPartition;
            CGU_INT bestIndexCount = 0;

            for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
            {
                CGU_UINT32 numEntries     = subset_entryCount[subset];
                bestEntryCount[subset] = numEntries;

                if (numEntries)
                {
                    bestEndpoints[subset * 2]     = epo_code[subset * 2];
                    bestEndpoints[subset * 2 + 1] = epo_code[subset * 2 + 1];

                    shared_temp[subset * 2].endPoint_low      = bestEndpoints[subset * 2];
                    shared_temp[subset * 2 + 1].endPoint_high = bestEndpoints[subset * 2 + 1];

                    for (k = 0; k < numEntries; k++)
                    {
                        bestindex[subset][k]          = storedBestindex[sortedBlockPartition][subset][k];
                        bestindex16[bestIndexCount++] = storedBestindex[sortedBlockPartition][subset][k];
                        shared_temp[k].colorindex     = storedBestindex[sortedBlockPartition][subset][k];
                    }
                }
            }

            err_best = err_optimized;
            // Early out if we found we can compress with error below the quality threshold
            if (err_best <= 0.01f)  // Threshold err
            {
                break;
            }
        }
    }

    if (block_mode != 7)
        err_best += opaque_err;

    if (err_best > best_err)
        return best_err;

    //**************************
    // Save the encoded block
    //**************************
    best_err = err_best;

    // Now we have all the data needed to encode the block
    // We need to pack the endpoints prior to encoding

    for (subset = 0; subset < g_modesettings[block_mode].maxSubSets; subset++)
    {
        packedEndpoints[subset * 2]     = 0;
        packedEndpoints[subset * 2 + 1] = 0;

        if (bestEntryCount[subset])
        {
            CGU_UINT32 rightAlignment = 0;

            // Sort out parity bits
            if (block_mode != 2)
            {
                // Sort out BCC parity bits
                // The LSB of channel 0 becomes bit 0 of the packed endpoint and every
                // channel is shifted down by one to drop that bit
                packedEndpoints[subset * 2]     = bestEndpoints[subset * 2][0] & 1;
                packedEndpoints[subset * 2 + 1] = bestEndpoints[subset * 2 + 1][0] & 1;
                for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
                {
                    bestEndpoints[subset * 2][ch] >>= 1;
                    bestEndpoints[subset * 2 + 1][ch] >>= 1;
                }
                rightAlignment++;
            }

            // Fixup endpoints
            // Channels are packed one after another, componentBits apart
            for (ch = 0; ch < g_modesettings[block_mode].channels3or4; ch++)
            {
                packedEndpoints[subset * 2] |= bestEndpoints[subset * 2][ch] << rightAlignment;
                packedEndpoints[subset * 2 + 1] |= bestEndpoints[subset * 2 + 1][ch] << rightAlignment;
                rightAlignment += g_modesettings[block_mode].componentBits;
            }
        }
    }

    // Convert the per-subset index lists back into pixel order
    CGU_UINT8 idxCount[3] = {0, 0, 0};
    for (k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
        CGU_UINT8 partsub         = cmp_get_partition_subset2(bestPartition, g_modesettings[block_mode].maxSubSets, k);
        CGU_UINT8 idxC            = idxCount[partsub];
        bestindex16[k]            = bestindex[partsub][idxC];
        idxCount[partsub]         = idxC + 1;
        shared_temp[k].colorindex = bestindex16[k];
    }

    CGU_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE];
    cmp_Encode_mode01237(block_mode, bestPartition, packedEndpoints, bestindex16, cmp_out);

    // Pack the 16 bytes into four little-endian 32 bit words.
    // Each byte is widened to 32 bit unsigned BEFORE shifting: shifting the
    // promoted (signed int) byte by 24 would overflow for values >= 0x80.
    best_cmp_out[0] = (CGU_UINT32)cmp_out[0] + ((CGU_UINT32)cmp_out[1] << 8) + ((CGU_UINT32)cmp_out[2] << 16) + ((CGU_UINT32)cmp_out[3] << 24);
    best_cmp_out[1] = (CGU_UINT32)cmp_out[4] + ((CGU_UINT32)cmp_out[5] << 8) + ((CGU_UINT32)cmp_out[6] << 16) + ((CGU_UINT32)cmp_out[7] << 24);
    best_cmp_out[2] = (CGU_UINT32)cmp_out[8] + ((CGU_UINT32)cmp_out[9] << 8) + ((CGU_UINT32)cmp_out[10] << 16) + ((CGU_UINT32)cmp_out[11] << 24);
    best_cmp_out[3] = (CGU_UINT32)cmp_out[12] + ((CGU_UINT32)cmp_out[13] << 8) + ((CGU_UINT32)cmp_out[14] << 16) + ((CGU_UINT32)cmp_out[15] << 24);

    //CGU_Vec4ui block = {0, 0, 0, 0};
    //block_package1(block, bestPartition, 0);
    //best_cmp_out[0] = block[0];
    //best_cmp_out[1] = block[1];
    //best_cmp_out[2] = block[2];
    //best_cmp_out[3] = block[3];
    //
    //printSharedTemp();

    return best_err;
#else
    CGU_UINT8 bestPartition = 0;
    // Find the best partition
    CGU_UINT32 pbit = 0;
    CGU_UINT32 error;
    CGU_UINT32 bestErr  = MAX_UINT;
    CGU_UINT32 bestpbit = 0;
    for (CGU_UINT8 mode_blockPartition = 0; mode_blockPartition < 64; mode_blockPartition++)
    {
        // NOTE(review): pbit is presumably updated by cmp_GetPartitionError - confirm against its signature
        error = cmp_GetPartitionError(pbit, mode_blockPartition, image_src);
        if (error < bestErr)
        {
            bestErr       = error;
            bestpbit      = pbit;
            bestPartition = mode_blockPartition;
        }
    }

    // Get the index for the partition
    // One call per pixel, from pixel 15 down to pixel 0; the mode argument is fixed at 1
    for (CGU_INT threadInBlock = 15; threadInBlock >= 0; threadInBlock--)
    {
        ProcessBlock(1, bestPartition, 0, bestpbit, 0, threadInBlock, threadInBlock, 0);
    }

    // print results for debug
    printSharedTemp();

    //=======================
    // Encode final block
    //========================
    {
        // CGU_Vec4ui blockGreen = {0xffe00040, 0xfffe0007, 0x00000001, 0x00000000};
        // CGU_Vec4ui blockBlue  = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};
        // CGU_Vec4ui block00    = {0xf0617fc0, 0xfffe0c3f, 0xff00fe11, 0xff01ef00};
        CGU_Vec4ui blockRed   = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui block      = {0, 0, 0, 0};
        // input_mode is fixed at 1, so only the block_package1 case is taken
        CGU_UINT32 input_mode = 1;
        switch (input_mode)
        {
        case 1:
            block_package1(block, bestPartition, 0);
            break;
        case 3:
            block_package3(block, bestPartition, 0);
            break;
        case 7:
            block_package7(block, bestPartition, 0);
            break;
        default:  // error unsupported mode used!
            block = blockRed;
            break;
        }

        best_cmp_out[0] = block[0];
        best_cmp_out[1] = block[1];
        best_cmp_out[2] = block[2];
        best_cmp_out[3] = block[3];
    }

    return 0.0f;
#endif
}
#endif // Not ASPM_HLSL

//======================================= MODES 45 =============================================
#ifndef ASPM_HLSL
#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)

// Compression results for one mode 4 / mode 5 candidate.
// Filled in by Compress_mode45 and consumed by cmp_Encode_mode4 / cmp_Encode_mode5.
struct cmp_mode_parameters2
{
    // Quantized color endpoints: [0..3] = first endpoint (x,y,z,w), [4..7] = second endpoint.
    // The encoders write entries [0..2] and [4..6].
    CGV_INT      color_qendpoint[8];
    // Quantized endpoints of the scalar (alpha) block, same layout as color_qendpoint.
    // The encoders write entries [0] and [4] only.
    CGV_INT      alpha_qendpoint[8];
    // Per-pixel index into the color endpoint range
    CGV_UINT8    color_index[16];
    // Per-pixel index into the scalar (alpha) endpoint range
    CGV_UINT8    alpha_index[16];
    // Index mode; written as a single bit by cmp_Encode_mode4, where a non-zero
    // value swaps the color and alpha index sets before encoding
    CGV_UINT32   idxMode;
    // Channel rotation; written as a 2 bit field and used by Compress_mode45 as the
    // row of componentRotations2
    CGV_UINT32   rotated_channel;
};

// Channel selection table indexed as [rotated_channel][slot].
// In Compress_mode45, slot 0 names the source channel that feeds the scalar
// (alpha) block and slots 1..3 name the source channels that feed the three
// components of the color block.
CMP_STATIC CMP_CONSTANT CGU_UINT8  componentRotations2[4][4] = {
    { COMP_ALPHA, COMP_RED,   COMP_GREEN, COMP_BLUE },
    { COMP_RED,   COMP_ALPHA, COMP_GREEN, COMP_BLUE },
    { COMP_GREEN, COMP_RED,   COMP_ALPHA, COMP_BLUE },
    { COMP_BLUE,  COMP_RED,   COMP_GREEN, COMP_ALPHA }
};

// Returns v shifted right by `bits` positions.
INLINE CGV_UINT8 old_shift_right_uint(CGV_UINT8 v, CGU_UINT8 bits)
{
    CGV_UINT8 shifted = v >> bits;  // (perf warning expected)
    return shifted;
}

// ORs bitVal into the byte stream `base` starting at bit position *offset and
// then advances *offset by `bits`.
//
// When the field crosses a byte boundary the upper part of bitVal is ORed into
// the following byte. The value is not masked to `bits` bits and the target
// bytes are not cleared first, so callers pass an in-range value and a
// zero-initialised buffer.
void old_Write8Bit(CGV_UINT8 base[], CGU_INT* uniform offset, CGU_INT bits, CGV_UINT8 bitVal)
{
    CGU_INT byteIndex = *offset / 8;  // byte that receives the low part
    CGU_INT bitInByte = *offset % 8;  // first free bit inside that byte

    base[byteIndex] |= bitVal << bitInByte;

    // Field does not fit in the current byte: spill the remainder into the next one
    if (bitInByte + bits > 8)
    {
        base[byteIndex + 1] |= old_shift_right_uint(bitVal, 8 - bitInByte);
    }

    *offset += bits;
}

// Exchanges the first n entries of the index arrays u and v, element by element.
INLINE void old_swap_index(CGV_UINT8 u[], CGV_UINT8 v[], CGU_INT n)
{
    CGU_INT pos = 0;
    while (pos < n)
    {
        CGV_UINT8 saved = v[pos];
        v[pos]          = u[pos];
        u[pos]          = saved;
        pos++;
    }
}

// Exchanges the first n endpoint components of u and v, element by element.
INLINE void old_swap_epo(CGV_INT u[], CGV_INT v[], CGV_INT n)
{
    for (CGU_INT comp = 0; comp < n; comp++)
    {
        CGV_INT fromV = v[comp];
        v[comp]       = u[comp];
        u[comp]       = fromV;
    }
}

// Ensures the first index of the block lies in the lower half of the index
// range, so that old_encode_index can store it with one bit less.
//
// If block_index[0] is in the upper half, the two endpoints (each `channels`
// entries long, stored back to back in `endpoint`) are exchanged and all
// SOURCE_BLOCK_SIZE indices are mirrored to (levels - 1) - index.
INLINE void old_encode_swap(CGV_INT endpoint[], CGU_INT channels, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
    CGU_INT levels = 1 << bits;

    // First index already in the lower half: nothing to do
    if (block_index[0] < levels / 2)
        return;

    old_swap_epo(&endpoint[0], &endpoint[channels], channels);

    for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
    {
#ifdef ASPM_GPU
        block_index[k] = (levels - 1) - block_index[k];
#else
        block_index[k] = CGV_UINT8(levels - 1) - block_index[k];
#endif
    }
}

// Appends the SOURCE_BLOCK_SIZE indices of a block to the bit stream `data`.
// The first index is written with (bits - 1) bits, every other index with
// `bits` bits. *pPos is advanced by old_Write8Bit after each write.
void old_encode_index(CGV_UINT8 data[16], CGU_INT* uniform pPos, CGV_UINT8 block_index[MAX_SUBSET_SIZE], CGU_INT bits)
{
    // First index: one bit shorter than the rest
    old_Write8Bit(data, pPos, bits - 1, block_index[0]);

    CGU_INT pixel = 1;
    while (pixel < SOURCE_BLOCK_SIZE)
    {
        CGV_UINT8 indexValue = block_index[pixel] & 0xFF;
        old_Write8Bit(data, pPos, bits, indexValue);
        pixel++;
    }
}

// Packs a mode 4 candidate into the 16 byte block cmp_out.
//
// Bit layout produced, starting at bit 0:
//   bits 0..4  : mode field, only bit 4 set
//   2 bits     : params.rotated_channel
//   1 bit      : params.idxMode
//   6 x 5 bits : color endpoints in the order R0 R1 G0 G1 B0 B1
//   2 x 6 bits : alpha endpoints A0 A1
//   31 bits    : sixteen 2 bit indices (first index stored with 1 bit)
//   47 bits    : sixteen 3 bit indices (first index stored with 2 bits)
//
// `params` is received by value, so the swaps performed here are applied to
// this function's own copy.
void cmp_Encode_mode4(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    CGU_INT bitPosition = 4;  // Position the pointer at the LSB

    // old_Write8Bit only ORs bits in, so the block must start out cleared
    for (CGU_INT k = 0; k < COMPRESSED_BLOCK_SIZE; k++)
        cmp_out[k] = 0;

    // mode 4 (5 bits) 00001
    old_Write8Bit(cmp_out, &bitPosition, 1, 1);

    // rotation 2 bits
    old_Write8Bit(cmp_out, &bitPosition, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));

    // idxMode 1 bit
    old_Write8Bit(cmp_out, &bitPosition, 1, CMP_STATIC_CAST(CGV_UINT8, params.idxMode));

    if (params.idxMode)
    {
        // The 2 bit index field is always written from color_index and the
        // 3 bit field from alpha_index, so exchange the two index sets. After
        // the exchange color_index belongs to the alpha endpoints and
        // alpha_index to the color endpoints.
        old_swap_index(params.color_index, params.alpha_index, 16);
        old_encode_swap(params.alpha_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.color_qendpoint, 4, params.alpha_index, 3);
    }
    else
    {
        old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
        old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 3);
    }

    // color endpoints 5 bits each
    // R0 : R1
    // G0 : G1
    // B0 : B1
    for (CGU_INT component = 0; component < 3; component++)
    {
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[component]));
        old_Write8Bit(cmp_out, &bitPosition, 5, CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + component]));
    }

    // alpha endpoints (6 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &bitPosition, 6, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));

    // index 2 bits each  (31 bits total)
    old_encode_index(cmp_out, &bitPosition, params.color_index, 2);
    // index 3 bits each  (47 bits total)
    old_encode_index(cmp_out, &bitPosition, params.alpha_index, 3);
}

// Packs a mode 5 candidate into the 16 byte block cmp_out.
//
// Bit layout produced, starting at bit 0:
//   bits 0..5  : mode field, only bit 5 set
//   2 bits     : params.rotated_channel
//   6 x 7 bits : color endpoints in the order R0 R1 G0 G1 B0 B1
//   2 x 8 bits : alpha endpoints A0 A1
//   31 bits    : sixteen 2 bit color indices (first index stored with 1 bit)
//   31 bits    : sixteen 2 bit alpha indices (first index stored with 1 bit)
//
// `params` is received by value, so the endpoint/index swaps done here are
// applied to this function's own copy.
void cmp_Encode_mode5(CMP_INOUT CGV_UINT8 cmp_out[COMPRESSED_BLOCK_SIZE], cmp_mode_parameters2 params)
{
    // Start from an all-zero block: the bit writer only ORs bits in
    CGU_INT byteIdx = 0;
    while (byteIdx < COMPRESSED_BLOCK_SIZE)
    {
        cmp_out[byteIdx] = 0;
        byteIdx++;
    }

    // mode 5 bits = 000001
    CGU_INT writePos = 5;  // Position the pointer at the LSB
    old_Write8Bit(cmp_out, &writePos, 1, 1);

    // Write 2 bit rotation
    old_Write8Bit(cmp_out, &writePos, 2, CMP_STATIC_CAST(CGV_UINT8, params.rotated_channel));

    // Make the first index of each set fit in one bit less
    old_encode_swap(params.color_qendpoint, 4, params.color_index, 2);
    old_encode_swap(params.alpha_qendpoint, 4, params.alpha_index, 2);

    // color endpoints (7 bits each), low then high for each of R, G, B
    for (CGU_INT rgb = 0; rgb < 3; rgb++)
    {
        CGV_UINT8 lowEnd  = CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[rgb]);
        CGV_UINT8 highEnd = CMP_STATIC_CAST(CGV_UINT8, params.color_qendpoint[4 + rgb]);
        old_Write8Bit(cmp_out, &writePos, 7, lowEnd);
        old_Write8Bit(cmp_out, &writePos, 7, highEnd);
    }

    // alpha endpoints (8 bits each)
    // A0 : A1
    old_Write8Bit(cmp_out, &writePos, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[0]));
    old_Write8Bit(cmp_out, &writePos, 8, CMP_STATIC_CAST(CGV_UINT8, params.alpha_qendpoint[4]));

    // color index 2 bits each  (31 bits total)
    old_encode_index(cmp_out, &writePos, params.color_index, 2);
    // alpha index 2 bits each  (31 bits total)
    old_encode_index(cmp_out, &writePos, params.alpha_index, 2);
}

// Compresses a block of SOURCE_BLOCK_SIZE pixels with mode 4 (blockMode == 4)
// or mode 5 (any other blockMode value) and stores the packed block in
// cmp_out[0..3].
//
// Every channel rotation (4) is combined with every index mode (2 for mode 4,
// 1 for mode 5). For each combination the color block and the scalar (alpha)
// block are quantized and optimized separately; the scalar error is divided
// by 3 before being added to the color error. Whenever the combined error
// beats the best seen so far the candidate is encoded straight into cmp_out,
// so on return cmp_out holds the lowest-error candidate. If no candidate ever
// beats the initial CMP_FLOAT_MAX, cmp_out is left untouched.
//
// Parameters
//   cmp_out   : receives the compressed block as four 32 bit words
//   blockMode : 4 selects mode 4, everything else is handled as mode 5
//   image_src : the source pixels
void Compress_mode45(CMP_INOUT CGU_UINT32 cmp_out[4], CGU_INT blockMode, CGU_Vec4ui image_src[SOURCE_BLOCK_SIZE])
{
    cmp_mode_parameters2 best_candidate;
    CGU_UINT32      channels3or4 = 4;
    CGU_UINT8       numClusters0[2];  // color block cluster count per index mode
    CGU_UINT8       numClusters1[2];  // scalar block cluster count per index mode
    CGU_INT         modeBits[2];      // [0] = color endpoint bits, [1] = scalar endpoint bits
    CGU_INT         max_idxMode;

    if (blockMode == 4)
    {
        max_idxMode     = 2;
        modeBits[0]     = 30;  // bits = 2 * (Red 5+ Grn 5+ blu 5)
        modeBits[1]     = 36;  // bits = 2 * (Alpha 6+6+6)
        numClusters0[0] = 4;
        numClusters0[1] = 8;
        numClusters1[0] = 8;
        numClusters1[1] = 4;
    }
    else
    {
        max_idxMode     = 1;
        modeBits[0]     = 42;  // bits = 2 * (Red 7+ Grn 7+ blu 7)
        modeBits[1]     = 48;  // bits = 2 * (Alpha 8+8+8) = 48
        numClusters0[0] = 4;
        numClusters0[1] = 4;
        numClusters1[0] = 4;
        numClusters1[1] = 4;
    }

    CGU_Vec4ui src_color_Block[SOURCE_BLOCK_SIZE];
    CGU_Vec4ui src_alpha_Block[SOURCE_BLOCK_SIZE];
    CGV_FLOAT  best_err  = CMP_FLOAT_MAX;

    // Go through each possible rotation and selection of index rotationBits)
    for (CGU_UINT8 rotated_channel = 0; rotated_channel < channels3or4; rotated_channel++)
    {
        // A

        // Build the rotated source blocks: the color block takes table slots 1..3,
        // the scalar block replicates table slot 0 into its first three components
        for (CGU_INT k = 0; k < SOURCE_BLOCK_SIZE; k++)
        {
            for (CGU_INT p = 0; p < 3; p++)
            {
                src_color_Block[k][p] = image_src[k][componentRotations2[rotated_channel][p + 1]];
                src_alpha_Block[k][p] = image_src[k][componentRotations2[rotated_channel][0]];
            }
            src_color_Block[k][3] = image_src[k][3];
            src_alpha_Block[k][3] = image_src[k][componentRotations2[3][3]];
        }

        CGV_FLOAT err_quantizer;
        CGV_FLOAT err_bestQuantizer = CMP_FLOAT_MAX;

        for (CGU_INT idxMode = 0; idxMode < max_idxMode; idxMode++)
        {
            // best_candidate's index arrays double as scratch output here. That is safe
            // because cmp_out is re-encoded immediately whenever a better candidate is found.
            err_quantizer = cmp_GetQuantizeIndex_old(best_candidate.color_index, src_color_Block, SOURCE_BLOCK_SIZE, numClusters0[idxMode], 3);

            err_quantizer += cmp_GetQuantizeIndex_old(best_candidate.alpha_index, src_alpha_Block, SOURCE_BLOCK_SIZE, numClusters1[idxMode], 3) / 3.0F;

            // If quality is high then run the full shaking for this config and
            // store the result if it beats the best overall error
            // Otherwise only run the shaking if the error is better than the best
            // quantizer error
            if (err_quantizer <= err_bestQuantizer)
            {
                err_bestQuantizer = err_quantizer;

                // Shake size gives the size of the shake cube
                CGV_FLOAT err_overallError;

                CGU_Vec4ui color_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                CGU_Vec4ui alpha_qendpoint2[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};

                CGU_UINT32 alpha_index[16];
                CGU_UINT32 color_index[16];

                for (int k = 0; k < 16; k++)
                {
                    alpha_index[k] = best_candidate.alpha_index[k];
                    color_index[k] = best_candidate.color_index[k];
                }

                CGU_UINT32 color_index_packed_out[2] = {0, 0};
                CGU_UINT32 alpha_index_packed_out[2] = {0, 0};

                // Color vector block
                err_overallError = cmp_optimize_IndexAndEndPoints(color_qendpoint2,
                                                                  color_index,
                                                                  color_index_packed_out,
                                                                  src_color_Block,
                                                                  16,
                                                                  numClusters0[idxMode],
                                                                  modeBits[0],
                                                                  3,
                                                                  0.01f,
                                                                  blockMode);

                // Alpha scalar block
                err_overallError += cmp_optimize_IndexAndEndPoints(alpha_qendpoint2,
                                                                   alpha_index,
                                                                   alpha_index_packed_out,
                                                                   src_alpha_Block,
                                                                   16,
                                                                   numClusters1[idxMode],
                                                                   modeBits[1],
                                                                   3,
                                                                   0.01f,
                                                                   blockMode) / 3;

                // If we beat the previous best then encode the block
                if (err_overallError < best_err)
                {
                    best_err = err_overallError;
                    best_candidate.idxMode         = idxMode;
                    best_candidate.rotated_channel = rotated_channel;

                    best_candidate.alpha_qendpoint[0] = alpha_qendpoint2[0].x;
                    best_candidate.alpha_qendpoint[1] = alpha_qendpoint2[0].y;
                    best_candidate.alpha_qendpoint[2] = alpha_qendpoint2[0].z;
                    best_candidate.alpha_qendpoint[3] = alpha_qendpoint2[0].w;
                    best_candidate.alpha_qendpoint[4] = alpha_qendpoint2[1].x;
                    best_candidate.alpha_qendpoint[5] = alpha_qendpoint2[1].y;
                    best_candidate.alpha_qendpoint[6] = alpha_qendpoint2[1].z;
                    best_candidate.alpha_qendpoint[7] = alpha_qendpoint2[1].w;

                    best_candidate.color_qendpoint[0] = color_qendpoint2[0].x;
                    best_candidate.color_qendpoint[1] = color_qendpoint2[0].y;
                    best_candidate.color_qendpoint[2] = color_qendpoint2[0].z;
                    best_candidate.color_qendpoint[3] = color_qendpoint2[0].w;
                    best_candidate.color_qendpoint[4] = color_qendpoint2[1].x;
                    best_candidate.color_qendpoint[5] = color_qendpoint2[1].y;
                    best_candidate.color_qendpoint[6] = color_qendpoint2[1].z;
                    best_candidate.color_qendpoint[7] = color_qendpoint2[1].w;

                    for (int k = 0; k < 16; k++)
                    {
                        best_candidate.color_index[k] = color_index[k];
                        best_candidate.alpha_index[k] = alpha_index[k];
                    }

                    CGV_UINT8 cmp_out16[COMPRESSED_BLOCK_SIZE];
                    if (blockMode == 4)
                        cmp_Encode_mode4(cmp_out16, best_candidate);
                    else
                        cmp_Encode_mode5(cmp_out16, best_candidate);

                    // Pack the 16 bytes into four little-endian 32 bit words.
                    // Each byte is widened to 32 bit unsigned BEFORE shifting: shifting the
                    // promoted (signed int) byte by 24 would overflow for values >= 0x80.
                    cmp_out[0] = (CGU_UINT32)cmp_out16[0]  + ((CGU_UINT32)cmp_out16[1] << 8)  + ((CGU_UINT32)cmp_out16[2] << 16)  + ((CGU_UINT32)cmp_out16[3] << 24);
                    cmp_out[1] = (CGU_UINT32)cmp_out16[4]  + ((CGU_UINT32)cmp_out16[5] << 8)  + ((CGU_UINT32)cmp_out16[6] << 16)  + ((CGU_UINT32)cmp_out16[7] << 24);
                    cmp_out[2] = (CGU_UINT32)cmp_out16[8]  + ((CGU_UINT32)cmp_out16[9] << 8)  + ((CGU_UINT32)cmp_out16[10] << 16) + ((CGU_UINT32)cmp_out16[11] << 24);
                    cmp_out[3] = (CGU_UINT32)cmp_out16[12] + ((CGU_UINT32)cmp_out16[13] << 8) + ((CGU_UINT32)cmp_out16[14] << 16) + ((CGU_UINT32)cmp_out16[15] << 24);
                }
            }
        }  // B
    }      // A
}

#endif
#endif

#ifdef ENABLE_CMP_REFINE_MODE6_API

// Least-squares fit of the two cluster end values for a set of indexed pixels.
//
// The per-index cluster means are obtained from GetClusterMean2. Each entry k
// then contributes with the weights (Mi_ - index) and index to a symmetric 2x2
// system, which is solved by explicit inversion.
//
// Returns FALSE, with both image_cluster entries set to 0, when the system is
// singular (determinant == 0). Otherwise stores the two solutions, scaled by
// Mi_, in image_cluster[0] and image_cluster[1] and returns TRUE.
CGU_BOOL get_ideal_cluster2(CMP_INOUT CGV_Vec4f image_cluster[2],
                            CMP_IN CGU_UINT32 index_cluster[16],
                            CMP_IN CGU_INT Mi_,
                            CMP_IN CGU_Vec4ui image_src[16],
                            CMP_IN CGU_UINT32 numEntries,
                            CMP_IN CGU_UINT32 channels3or4)
{
    // get ideal cluster centers
    CGV_Vec4f cluster_mean[16];
    for (CGU_UINT32 slot = 0; slot < 16; slot++)
        cluster_mean[slot] = 0.0f;

    GetClusterMean2(cluster_mean, image_src, index_cluster, numEntries, channels3or4);  // unrounded

    // Entries of the symmetric 2x2 matrix
    CGV_FLOAT sum_ww = 0;  // sum of (Mi_ - index)^2
    CGV_FLOAT sum_wi = 0;  // sum of index * (Mi_ - index)
    CGV_FLOAT sum_ii = 0;  // sum of index^2

    // right part for RMS fit problem
    CGV_Vec4f rhs_w;
    CGV_Vec4f rhs_i;
    rhs_w = 0.0f;
    rhs_i = 0.0f;

    // weight with cnt if running on compacted index
    for (CGU_UINT32 entry = 0; entry < numEntries; entry++)
    {
        CGU_UINT32 idx = index_cluster[entry];

        sum_ww += (Mi_ - idx) * (Mi_ - idx);
        sum_wi += idx * (Mi_ - idx);  // matrix is symmetric
        sum_ii += idx * idx;

        rhs_w += cluster_mean[idx] * (CGU_FLOAT)(Mi_ - idx);
        rhs_i += cluster_mean[idx] * (CGU_FLOAT)idx;
    }

    CGV_FLOAT det = sum_ww * sum_ii - sum_wi * sum_wi;

    // det == 0 means that index and (Mi_ - index) are collinear, which implies
    // only one active index; taken care of separately by the caller
    if (det == 0)
    {
        image_cluster[0] = 0.0f;
        image_cluster[1] = 0.0f;
        return FALSE;
    }

    // Inverse of [[ww, wi], [wi, ii]] is 1/det * [[ii, -wi], [-wi, ww]]
    CGV_FLOAT inv00 = sum_ii / det;
    CGV_FLOAT inv11 = sum_ww / det;
    CGV_FLOAT inv01 = -sum_wi / det;

    CGV_FLOAT Mif = (CGV_FLOAT)Mi_;

    // values can exceed 255 here, clamp made no diff in quality!
    image_cluster[0] = (((rhs_w * inv00) + (rhs_i * inv01)) * Mif);
    image_cluster[1] = (((rhs_w * inv01) + (rhs_i * inv11)) * Mif);

    return TRUE;
}



//---------------------------------------------------------------------------------
// shake2
//
// Searches a small neighbourhood of quantized end point codes around the ideal
// (floating point) end points and keeps, per channel, the pair of codes with the
// lowest squared error against the source entries. The per channel results are
// then combined over the parity patterns listed in par_vectors42_nd and the best
// combination is returned.
//
// epo_code_shake : OUT best end point codes, [0] = first end point, [1] = second
// image_cluster  : ideal end points, quantized with cmp_ep_find_floor2()
// index_cluster  : ramp index of every entry, forwarded to GetRamp2()
// image_src      : source entries that the ramp is compared against
// index_bits     : index precision forwarded to GetRamp2()
// mtype          : unused
// max_bits       : number of bits per channel used for the end point codes
// use_par        : expected to be 0 or 1 (it is used in "1 << use_par" and
//                  "~use_par"); when 1 both parities are searched per end point
// numEntries     : number of entries in index_cluster / image_src (max 16)
// channels3or4   : 3 = RGB, 4 = RGBA
//
// Returns the lowest summed channel error found, or CMP_FLOAT_MAX if none of the
// parity patterns produced a smaller value.
//
// NOTE(review): when use_par is 0 only err_ed[0][0][*] and epo_code_par[0][0][*][*]
// are written, while the final selection indexes them with values taken from
// par_vectors42_nd - confirm that table only yields 0 in that case. The only
// caller visible here passes use_par = 1.
//---------------------------------------------------------------------------------
CGV_FLOAT shake2(CMP_INOUT CGU_Vec4ui epo_code_shake[2],
                 CMP_IN CGV_Vec4f image_cluster[2],
                 CMP_IN CGU_UINT32 index_cluster[16],
                 CMP_IN CGU_Vec4ui image_src[16],
                 CMP_IN CGU_UINT32 index_bits,
                 CMP_IN CGU_UINT32 mtype,
                 CMP_IN CGU_UINT32 max_bits[4],
                 CMP_IN CGU_UINT32 use_par,
                 CMP_IN CGU_UINT32 numEntries,  // max 16
                 CMP_IN CGU_UINT32 channels3or4)
{
    CMP_UNUSED(mtype);
    CGV_FLOAT best_err = CMP_FLOAT_MAX;

#define SHAKESIZE1 1
#define SHAKESIZE2 2
    // shake single or                                   - cartesian
    // shake odd/odd and even/even or                    - same parity
    // shake odd/odd odd/even , even/odd and even/even   - bcc

    // Best error and best code pair per [parity A][parity B](...)[channel]
    CGV_FLOAT  err_ed[2][2][4];
    CGU_UINT32 epo_code_par[2][2][2][4];

    for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
    {
        CGU_UINT32 ppA = 0;
        CGU_UINT32 ppB = 0;
        CGU_UINT32 rr  = (use_par ? 2 : 1);
        CGU_UINT32 epo_code_epi0[2];  // first end point: [0] = begin of range, [1] = end of range
        CGU_UINT32 epo_code_epi1[2];  // second end point: [0] = begin of range, [1] = end of range

        // Largest code that fits into this channel's bit budget
        CGU_UINT32 max_code = (1 << max_bits[ch]) - 1;

        for (ppA = 0; ppA < rr; ppA++)
        {  // loop max =2
            for (ppB = 0; ppB < rr; ppB++)
            {  //loop  max =2

                // set default ranges
                switch (ch)
                {
                case 0:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].x, max_bits[0], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].x, max_bits[0], use_par, ppB);
                    break;
                case 1:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].y, max_bits[1], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].y, max_bits[1], use_par, ppB);
                    break;
                case 2:
                    epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].z, max_bits[2], use_par, ppA);
                    epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].z, max_bits[2], use_par, ppB);
                    break;
                case 3:
                    if (channels3or4 == 4)
                    {
                        epo_code_epi0[0] = epo_code_epi0[1] = cmp_ep_find_floor2(image_cluster[0].w, max_bits[3], use_par, ppA);
                        epo_code_epi1[0] = epo_code_epi1[1] = cmp_ep_find_floor2(image_cluster[1].w, max_bits[3], use_par, ppB);
                    }
                    break;
                }

                // set begin range
                // The "& (~use_par)" clears bit 0 of the delta when use_par is 1, so
                // the range limits keep the parity of the starting code.
                epo_code_epi0[0] -= ((epo_code_epi0[0] < SHAKESIZE1 ? epo_code_epi0[0] : SHAKESIZE1)) & (~use_par);
                epo_code_epi1[0] -= ((epo_code_epi1[0] < SHAKESIZE1 ? epo_code_epi1[0] : SHAKESIZE1)) & (~use_par);

                // set end range (never beyond max_code)
                epo_code_epi0[1] += (max_code - epo_code_epi0[1] < SHAKESIZE2 ? max_code - epo_code_epi0[1] : SHAKESIZE2) & (~use_par);
                epo_code_epi1[1] += (max_code - epo_code_epi1[1] < SHAKESIZE2 ? max_code - epo_code_epi1[1] : SHAKESIZE2) & (~use_par);

                // With parity enabled only every second code has the wanted parity
                CGU_UINT32 step      = (1 << use_par);
                err_ed[ppA][ppB][ch] = CMP_FLOAT_MAX;

                for (CGU_UINT32 epo_p0 = epo_code_epi0[0]; epo_p0 <= epo_code_epi0[1]; epo_p0 += step)
                {
                    for (CGU_UINT32 epo_p1 = epo_code_epi1[0]; epo_p1 <= epo_code_epi1[1]; epo_p1 += step)
                    {
                        CGV_FLOAT image_square_diff = 0.0F;
                        CGV_FLOAT image_ramp;

                        // Sum the squared error over every entry. The loop has to start
                        // at entry 0, otherwise the first entry never influences which
                        // end point codes are selected.
                        for (CGU_UINT32 _mc = 0; _mc < numEntries; _mc++)
                        {
                            image_ramp = GetRamp2(epo_p0, epo_p1, index_cluster[_mc], index_bits);
                            switch (ch)
                            {
                            case 0:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].x);
                                break;
                            case 1:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].y);
                                break;
                            case 2:
                                image_square_diff += cmp_squaref(image_ramp - image_src[_mc].z);
                                break;
                            case 3:
                                if (channels3or4 == 4)
                                    image_square_diff += cmp_squaref(image_ramp - image_src[_mc].w);
                                break;
                            }
                        }

                        // Keep the best code pair for this parity combination and channel
                        if (image_square_diff < err_ed[ppA][ppB][ch])
                        {
                            err_ed[ppA][ppB][ch]          = image_square_diff;
                            epo_code_par[ppA][ppB][0][ch] = epo_p0;
                            epo_code_par[ppA][ppB][1][ch] = epo_p1;
                        }
                    }
                }
            }  // ppB
        }      // ppA
    }          // ch

    //---------------------------------------------------------
    // CMP_CONSTANT CGU_UINT8 npv_nd[2][8] = {
    //     {1, 2, 4, 8, 16, 32, 0, 0},  // 3 channel
    //     {1, 2, 4, 0, 0, 0, 0, 0}     // 4 channel  type index 0..7
    // };
    // for (CGU_INT pn = 0; pn < npv_nd[channels3or4 - 3][type]; pn++)
    CGU_UINT32 bits = 4;  // for mode 6 its 4
    for (CGU_UINT32 pn = 0; pn < bits; pn++)
    {
        CGV_FLOAT  err_2 = 0.0F;
        CGU_UINT32 d1    = 0;
        CGU_UINT32 d2    = 0;

        // Total error of parity pattern pn over all channels
        for (CGU_UINT32 ch = 0; ch < channels3or4; ch++)
        {
            d1 = par_vectors42_nd[pn][0][ch];
            d2 = par_vectors42_nd[pn][1][ch];
            err_2 += err_ed[d1][d2][ch];
        }

        if (err_2 < best_err)
        {
            best_err            = err_2;
            d1                  = par_vectors42_nd[pn][0][0];
            d2                  = par_vectors42_nd[pn][1][0];
            epo_code_shake[0].x = epo_code_par[d1][d2][0][0];
            epo_code_shake[1].x = epo_code_par[d1][d2][1][0];

            d1                  = par_vectors42_nd[pn][0][1];
            d2                  = par_vectors42_nd[pn][1][1];
            epo_code_shake[0].y = epo_code_par[d1][d2][0][1];
            epo_code_shake[1].y = epo_code_par[d1][d2][1][1];

            d1                  = par_vectors42_nd[pn][0][2];
            d2                  = par_vectors42_nd[pn][1][2];
            epo_code_shake[0].z = epo_code_par[d1][d2][0][2];
            epo_code_shake[1].z = epo_code_par[d1][d2][1][2];

            // The alpha codes are only produced when four channels were searched
            if (channels3or4 == 4)
            {
                d1                  = par_vectors42_nd[pn][0][3];
                d2                  = par_vectors42_nd[pn][1][3];
                epo_code_shake[0].w = epo_code_par[d1][d2][0][3];
                epo_code_shake[1].w = epo_code_par[d1][d2][1][3];
            }
        }
    }

    return best_err;
}

//---------------------------------------------------------------------------------
// requantized_image_err2
//
// Rebuilds the colour ramp described by epo_code_best (via GetRamp2) and assigns
// every source entry to the ramp entry with the smallest squared distance over
// all four components.
//
// index_best    : OUT chosen ramp entry for each source entry
// epo_code_best : end point codes, [0] = first end point, [1] = second
// index_bits    : index precision forwarded to GetRamp2()
// max_bits      : unused
// image_src     : source entries
// numEntries    : number of source entries; also the number of ramp entries that
//                 are generated and searched (max 16)
// channels3or4  : unused, the w component always takes part in the distance
//
// Returns the sum of the per entry minimum squared distances.
//---------------------------------------------------------------------------------
CGV_FLOAT requantized_image_err2(CMP_INOUT CGU_UINT32 index_best[16],
                                 CMP_IN CGU_Vec4ui epo_code_best[2],
                                 CMP_IN CGU_UINT32 index_bits,
                                 CMP_IN CGU_UINT32 max_bits[4],
                                 CMP_IN CGU_Vec4ui image_src[16],
                                 CMP_IN CGU_UINT32 numEntries,  // max 16
                                 CMP_IN CGU_UINT32 channels3or4)
{
    CMP_UNUSED(channels3or4);
    CMP_UNUSED(max_bits);

    // Step 1: expand the end point codes into the ramp of candidate colours
    CGV_Vec4f ramp_color[16];

    for (CGU_UINT32 step = 0; step < numEntries; step++)
    {
        ramp_color[step].x = GetRamp2(epo_code_best[0].x, epo_code_best[1].x, step, index_bits);
        ramp_color[step].y = GetRamp2(epo_code_best[0].y, epo_code_best[1].y, step, index_bits);
        ramp_color[step].z = GetRamp2(epo_code_best[0].z, epo_code_best[1].z, step, index_bits);
        ramp_color[step].w = GetRamp2(epo_code_best[0].w, epo_code_best[1].w, step, index_bits);
    }

    // Step 2: pick the nearest ramp colour for every source entry
    CGV_FLOAT total_err = 0.0F;

    for (CGU_UINT32 entry = 0; entry < numEntries; entry++)
    {
        // Seed above the largest possible distance:
        // four channels of 256 squared, plus one = (256 * 256 * 4) + 1
        CGV_FLOAT  nearest_err  = 262145.0f;
        CGU_UINT32 nearest_step = 0;

        for (CGU_UINT8 candidate = 0; candidate < numEntries; candidate++)
        {
            CGV_Vec4f delta;
            delta.x = ramp_color[candidate].x - image_src[entry].x;
            delta.y = ramp_color[candidate].y - image_src[entry].y;
            delta.z = ramp_color[candidate].z - image_src[entry].z;
            delta.w = ramp_color[candidate].w - image_src[entry].w;

            CGV_FLOAT candidate_err = cmp_dot4f(delta, delta);

            // Strict compare: on a tie the lowest ramp entry wins
            if (candidate_err < nearest_err)
            {
                nearest_err  = candidate_err;
                nearest_step = candidate;
            }
        }

        index_best[entry] = nearest_step;
        total_err += nearest_err;
    }

    return total_err;
}

//---------------------------------------------------------------------------------
// cmp_mode6_optimize_IndexAndEndPoints
//
// Iteratively refines the end point codes and indices of a block:
//   1. collapse the current indices (index_collapse2),
//   2. for every slope/offset remapping of those indices that stays within the
//      last cluster, compute ideal end points (get_ideal_cluster2) and search
//      the quantized codes around them (shake2),
//   3. re-assign the indices for the best codes (requantized_image_err2) and
//      keep the result when it lowers the error.
// The loop stops after MAX_TRY_SHAKER passes, when the error reaches
// errorThreshold, or when the indices collapse to a single value.
//
// epo_code_out / index_io are only overwritten when an improvement is found.
// Returns the best error found; 0.0f when the incoming indices already collapse
// to a single value; CMP_FLOAT_MAX when no pass produced a usable result.
//---------------------------------------------------------------------------------
CGV_FLOAT cmp_mode6_optimize_IndexAndEndPoints(CMP_INOUT CGU_Vec4ui epo_code_out[2],  //
                                               CMP_INOUT CGU_UINT32 index_io[16],     // Make sure input index is 0..15 range
                                               CMP_IN CGU_Vec4ui image_src[16],
                                               CMP_IN CGU_UINT32 numEntries,          // max 16
                                               CMP_IN CGU_UINT32 Mi_,                 // last cluster , This should be no larger than 16
                                               CMP_IN CGU_UINT32 bits,                // total for all components
                                               CMP_IN CGU_UINT32 channels3or4,        // IN: 3 = RGB or 4 = RGBA (4 = MAX_CHANNELS)
                                               CMP_IN CGU_FLOAT errorThreshold) 
{
    CMP_UNUSED(bits);

    // Fixed settings for Mode 6 (58 bits, 4 channels)
    CGU_UINT32 mode_type       = 2;             // = bits % (2 * channels3or4)
    CGU_UINT32 parity_on       = 1;             // = (type != 0)
    CGU_UINT32 channel_bits[4] = {8, 8, 8, 8};  // = (bits + channels2 - 1) / channels2
    CGU_UINT32 ramp_bits       = 4;             // index precision

    // Highest index value that a remapped index may take
    CGU_UINT32 last_index = Mi_ - 1;

    // Work on a private copy of the caller's indices
    CGU_UINT32 work_index[16];
    for (CGU_UINT32 n = 0; n < numEntries; n++)
        work_index[n] = index_io[n];

    CGU_UINT32 index_span = index_collapse2(work_index, numEntries);

    // Solid colour 4x4 block: nothing to optimize
    if (index_span == 0)
        return 0.0f;

    CGV_FLOAT  lowest_err = CMP_FLOAT_MAX;
    CGU_UINT32 pass_limit = MAX_TRY_SHAKER;  // should be set by quality

    for (CGU_UINT32 pass = 0; pass < pass_limit; pass++)
    {
        CGV_FLOAT  pass_err      = CMP_FLOAT_MAX;
        CGU_Vec4ui pass_codes[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
        CGU_UINT32 trial_index[16];

        for (CGU_UINT32 n = 0; n < numEntries; n++)
            trial_index[n] = 0;

        // Try every scaling (slope) of the collapsed indices that still fits
        for (CGU_UINT32 slope = 1; (slope * index_span) <= last_index; slope++)
        {
            CGV_Vec4f  ideal_ep[2]    = {{0.0f, 0.0f, 0.0f, 0.0f}, {0.0f, 0.0f, 0.0f, 0.0f}};
            CGU_UINT32 highest_offset = last_index - slope * index_span;

            // ... and every shift (offset) that keeps the scaled indices in range
            for (CGU_UINT32 offset = 0; offset <= highest_offset; offset++)
            {
                for (CGU_UINT32 n = 0; n < numEntries; n++)
                    trial_index[n] = work_index[n] * slope + offset;

                if (get_ideal_cluster2(ideal_ep, trial_index, last_index, image_src, numEntries, channels3or4))
                {
                    CGU_Vec4ui trial_codes[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
                    CGV_FLOAT  trial_err      = shake2(trial_codes,  // return new epo
                                                       ideal_ep,
                                                       trial_index,
                                                       image_src,
                                                       ramp_bits,
                                                       mode_type,
                                                       channel_bits,
                                                       parity_on,
                                                       numEntries,  // max 16
                                                       channels3or4);

                    if (trial_err < pass_err)
                    {
                        pass_err      = trial_err;
                        pass_codes[0] = trial_codes[0];
                        pass_codes[1] = trial_codes[1];
                    }
                }
            }
        }

        // Only re-index when this pass produced a candidate at all
        if (pass_err != CMP_FLOAT_MAX)
        {
            CGU_UINT32 new_index[16] = {0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0};
            CGV_FLOAT  requant_err   = requantized_image_err2(new_index,   // new index results
                                                              pass_codes,  // prior result input
                                                              ramp_bits,
                                                              channel_bits,
                                                              image_src,
                                                              numEntries,
                                                              channels3or4);

            if (requant_err < lowest_err)
            {
                // Publish the improvement and continue the search from it
                for (CGU_UINT32 n = 0; n < numEntries; n++)
                {
                    work_index[n] = new_index[n];
                    index_io[n]   = new_index[n];
                }

                epo_code_out[0] = pass_codes[0];
                epo_code_out[1] = pass_codes[1];
                lowest_err      = requant_err;
            }
        }

        // Early out if we have our target err
        if (lowest_err <= errorThreshold)
            break;

        // Collapse again for the next pass; stop once only one index value is left
        index_span = index_collapse2(work_index, numEntries);
        if (index_span == 0)
            break;
    }

    // Best result over all passes (CMP_FLOAT_MAX if nothing was found)
    return lowest_err;
}

#endif

#endif // ENABLE_CMP_API : CPU & GPU Code block

//=================================================================================
// GPU API Interfaces
// mode 4 5 6 all have 1 subset per block, and fix-up index is always index 0
//
// TryMode456CS
// Each 4x4 block is processed by MAX_USED_THREAD (16) consecutive threads:
//   1. every thread loads one pixel of the block into shared_temp,
//   2. a parallel min/max reduction leaves the block's low/high end points in
//      shared_temp[threadBase],
//   3. threads 0..7 try mode 4, threads 8..11 try mode 5 and threads 12..15 try
//      mode 6 (subject to the ENABLE_MODEx defines), each accumulating an error,
//   4. a parallel reduction keeps the candidate with the smallest error and the
//      first thread of the block writes it to g_OutBuff1[blockID],
//   5. with ENABLE_CMP_MODE6 the first thread may additionally run the CMP mode 6
//      encoder and overwrite error/mode/data2 with its result.
//=================================================================================
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode456CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID) 
{
    // Map the flat thread index to (block handled by this thread, thread within that block)
    CMP_CONSTANT     CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32       BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32       blockInGroup    = GI / MAX_USED_THREAD;
    CGU_UINT32       blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32       threadBase      = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32       threadInBlock   = GI - threadBase;

    // Block coordinates and the texel position of the block's first pixel
    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

#if (defined(ENABLE_MODE4) || defined(ENABLE_MODE5) || defined(ENABLE_MODE6)|| defined(ENABLE_CMP_MODE6))

    // One pixel per thread: scale to 0..255, clamp, truncate to unsigned and use it
    // as the starting value of both the low and the high end point.
    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;

        shared_temp[GI].endPoint_low  = shared_temp[GI].pixel;
        shared_temp[GI].endPoint_high = shared_temp[GI].pixel;
    }
    GroupSync();

    // Min/max reduction over the 16 pixels in strides of 8, 4, 2 and 1; after the
    // last step the result is held by the first thread of the block.
    // NOTE(review): cmp_min / cmp_max are presumably component-wise - confirm.
    if (threadInBlock < 8)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
    }
    GroupSync();

    if (threadInBlock < 4)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
    }
    GroupSync();

    if (threadInBlock < 2)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
    }
    GroupSync();

    if (threadInBlock < 1)
    {
        shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
        shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
    }
    GroupSync();

    // Every thread of the block starts from the same reduced end points
    CGU_Vec4ui endPoint[2];
    endPoint[0] = shared_temp[threadBase].endPoint_low;
    endPoint[1] = shared_temp[threadBase].endPoint_high;

    // Per-thread candidate; error stays at the maximum if no mode is tried by this thread
    CGU_UINT32 error = 0xFFFFFFFF;
    CGU_UINT32 mode = 0;
    CGU_UINT32 index_selector = 0;
    CGU_UINT32 rotation = 0;

    // indexPrec.x selects the aStep/aWeight table for colour, indexPrec.y the one for alpha
    CGU_Vec2ui indexPrec;
    if (threadInBlock < 8) // all threads of threadInBlock < 8 will be working on trying out mode 4, since only mode 4 has index selector bit
    {
        if (0 == (threadInBlock & 1)) // thread 0, 2, 4, 6
        {
            //2 represents 2bit index precision; 1 represents 3bit index precision
            index_selector = 0;
            indexPrec = CGU_Vec2ui( 2, 1 );
        }
        else                          // thread 1, 3, 5, 7
        {
            //2 represents 2bit index precision; 1 represents 3bit index precision
            index_selector = 1;
            indexPrec = CGU_Vec2ui( 1, 2 );
        }
    }
    else
    {
         //2 represents 2bit index precision
        indexPrec = CGU_Vec2ui( 2, 2 );
    }

    CGU_Vec4ui pixel_r;         // reconstructed pixel, later the difference to the source pixel
    CGU_UINT32 color_index;
    CGU_UINT32 alpha_index;
    CGU_Vec4i span;             // end point 1 - end point 0
    CGU_Vec2i span_norm_sqr;    // squared length of span (x: colour, y: alpha in modes 4/5)
    CGU_Vec2i dotProduct;

#if defined(ENABLE_MODE4) || defined(ENABLE_MODE5)
    if (threadInBlock < 12) // Try mode 4 5 in threads 0..11
    {
        // Receives the output of compress_endpoints4/5; it is not read again in this function
        CGU_Vec4ui ep_quantized[2];
        // mode 4 5 have component rotation
        if ((threadInBlock < 2) || (8 == threadInBlock))       // rotation = 0 in thread 0, 1 (mode 4) and 8 (mode 5)
        {
            rotation = 0;
        }
        else if ((threadInBlock < 4) || (9 == threadInBlock))  // rotation = 1 in thread 2, 3 (mode 4) and 9 (mode 5)
        {
            rotation = 1;
            set_pixel_rotation(endPoint[0],rotation);
            set_pixel_rotation(endPoint[1],rotation);
        }
        else if ((threadInBlock < 6) || (10 == threadInBlock)) // rotation = 2 in thread 4, 5 (mode 4) and 10 (mode 5)
        {
            rotation = 2;
            set_pixel_rotation(endPoint[0],rotation);
            set_pixel_rotation(endPoint[1],rotation);
        }
        else if ((threadInBlock < 8) || (11 == threadInBlock)) // rotation = 3 in thread 6, 7 (mode 4) and 11 (mode 5)
        {
            rotation = 3;
            set_pixel_rotation(endPoint[0],rotation);
            set_pixel_rotation(endPoint[1],rotation);
        }

        if (threadInBlock < 8)  // try mode 4 in threads 0..7
        {
            // mode 4 thread distribution
            // Thread           0   1   2   3   4   5   6   7
            // Rotation         0   0   1   1   2   2   3   3
            // Index selector   0   1   0   1   0   1   0   1

            mode = 4;
            compress_endpoints4( endPoint,ep_quantized );
        }
        else                    // try mode 5 in threads 8..11
        {
            // mode 5 thread distribution
            // Thread    8  9  10  11
            // Rotation  0  1   2   3

            mode = 5;
            compress_endpoints5( endPoint,ep_quantized );
        }

        // Pixel 0 of the block decides the end point order below
        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;
        set_pixel_rotation(pixel,rotation);

        span = cmp_castimp(endPoint[1] - endPoint[0]);
        span_norm_sqr = CGU_Vec2i( dot( span.rgb, span.rgb ), span.a * span.a );

        // should be the same as above
        // Colour: if pixel 0 is further from end point 0 than from end point 1,
        // swap the colour end points and negate the colour span.
        CGU_Vec3ui diff0 = pixel.rgb - endPoint[0].rgb;
        CGU_Vec3ui diff1 = pixel.rgb - endPoint[1].rgb;
        dotProduct = CGU_Vec2i( dot( diff0, diff0), dot( diff1, diff1) );

        if ( dotProduct.x > dotProduct.y )
        {
            span.rgb.x = -span.rgb.x;
            span.rgb.y = -span.rgb.y;
            span.rgb.z = -span.rgb.z;
            swap(endPoint[0].rgb, endPoint[1].rgb);
        }

        // Alpha: the same test, done independently of the colour channels
        CGU_UINT32 diffa0 = pixel.a - endPoint[0].a;
        CGU_UINT32 diffa1 = pixel.a - endPoint[1].a;
        dotProduct = CGU_Vec2i( dot( diffa0, diffa0 ), dot( diffa1,diffa1 ) );
        if ( dotProduct.x > dotProduct.y )
        {
            span.a = -span.a;
            swap(endPoint[0].a, endPoint[1].a);
        }

        // Accumulate the error of all 16 pixels for this rotation / index selector
        error = 0;
        for ( CGU_UINT32 i = 0; i < 16; i ++ )
        {
            pixel = shared_temp[threadBase + i].pixel;
            set_pixel_rotation(pixel,rotation);

            diff0 = pixel.rgb - endPoint[0].rgb;

            // Project the pixel onto the span; index 0 for a zero-length span or a
            // non-positive projection, the last aStep entry when at or past end point 1.
            dotProduct.x = dot( span.rgb, diff0 );
            color_index = ( span_norm_sqr.x <= 0 /*endPoint[0] == endPoint[1]*/ || dotProduct.x <= 0 /*pixel == endPoint[0]*/ ) ? 0
                : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[indexPrec.x][ CGU_UINT32( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );

            diffa0 = pixel.a - endPoint[0].a;
            dotProduct.y = dot( span.a, diffa0 );
            alpha_index = ( span_norm_sqr.y <= 0 || dotProduct.y <= 0 ) ? 0
                : ( ( dotProduct.y < span_norm_sqr.y ) ? aStep[indexPrec.y][ CGU_UINT32( dotProduct.y * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );

            // Interpolate with 6-bit weights: (e0 * (64 - w) + e1 * w + 32) >> 6
            pixel_r.rgb = ( endPoint[0].rgb * ( 64 - aWeight[indexPrec.x][color_index] ) + endPoint[1].rgb * aWeight[indexPrec.x][color_index] + 32U );

            pixel_r.rgb.x =  pixel_r.rgb.x >> 6;
            pixel_r.rgb.y =  pixel_r.rgb.y >> 6;
            pixel_r.rgb.z =  pixel_r.rgb.z >> 6;

            pixel_r.a   = ( endPoint[0].a   * ( 64 - aWeight[indexPrec.y][alpha_index] ) + endPoint[1].a   * aWeight[indexPrec.y][alpha_index] + 32 ) >> 6;

            // NOTE(review): presumably orders the two values per component so that the
            // unsigned subtraction below cannot wrap - confirm against Ensure_A_Is_Larger.
            Ensure_A_Is_Larger( pixel_r, pixel );

            pixel_r -= pixel;
            // NOTE(review): presumably moves the difference back to the unrotated
            // channel order before the error is computed - confirm.
            set_pixel_rotation(pixel_r,rotation);
            error += ComputeError(pixel_r, pixel_r);
        }
    }
    else 
#endif
#ifdef ENABLE_MODE6
    if (threadInBlock < 16)// Try mode 6 in threads 12..15, since in mode 4 5 6, only mode 6 has p bit
    {
        // p encodes the two p-bits tried by this thread: bit 0 and bit 1 are passed
        // to compress_endpoints6 as separate components.
        CGU_UINT32 p = threadInBlock - 12;
        CGU_Vec4ui ep_quantized[2];

        compress_endpoints6( endPoint,ep_quantized, CGU_Vec2ui(p & 1 , (p >> 1)& 1 ) );

        CGU_Vec4ui pixel = shared_temp[threadBase + 0].pixel;

        span = cmp_castimp( endPoint[1] - endPoint[0] );
        span_norm_sqr = dot( span, span );

        // Swap the end points when the projection of pixel 0 falls in the upper part of the span
        CGU_Vec4ui diff4 = pixel - endPoint[0];
        dotProduct = dot( span, diff4 );
        if ( span_norm_sqr.x > 0 && dotProduct.x >= 0 && CGU_UINT32( dotProduct.x * 63.49999 ) > CGU_UINT32( 32 * span_norm_sqr.x ) )
        {
            span = -span;
            swap(endPoint[0], endPoint[1]);
        }
            
        // Accumulate the error of all 16 pixels; colour and alpha share one index (table 0)
        error = 0;
        for ( CGU_UINT32 i = 0; i < 16; i ++ )
        {
            pixel = shared_temp[threadBase + i].pixel;
            diff4 = pixel - endPoint[0];
            dotProduct.x = dot( span, diff4 );
            color_index = ( span_norm_sqr.x <= 0 || dotProduct.x <= 0 ) ? 0
                : ( ( dotProduct.x < span_norm_sqr.x ) ? aStep[0][ CGU_UINT32( dotProduct.x * 63.49999 / span_norm_sqr.x ) ] : aStep[0][63] );
            
            pixel_r = (  endPoint[0] * ( 64 - aWeight[0][color_index] ) + 
                         endPoint[1] * aWeight[0][color_index]  + 32U ) >> 6;
        
            Ensure_A_Is_Larger( pixel_r, pixel );
            pixel_r -= pixel;
            error += ComputeError(pixel_r, pixel_r);
        }

        mode = 6;
        rotation = p;    // Borrow rotation for p
    }
#endif

    // Publish this thread's candidate for the reduction below
    shared_temp[GI].error          = error;
    shared_temp[GI].mode           = mode;
    shared_temp[GI].index_selector = index_selector;
    shared_temp[GI].rotation       = rotation;
    GroupSync();

    // Reduction in strides of 8, 4, 2 and 1: a slot is only replaced when the other
    // candidate's error is strictly smaller, so on a tie the lower thread wins.
    if (threadInBlock < 8)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
        {
            shared_temp[GI].error           = shared_temp[GI + 8].error;
            shared_temp[GI].mode            = shared_temp[GI + 8].mode;
            shared_temp[GI].index_selector  = shared_temp[GI + 8].index_selector;
            shared_temp[GI].rotation        = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 4)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
        {
            shared_temp[GI].error = shared_temp[GI + 4].error;
            shared_temp[GI].mode = shared_temp[GI + 4].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 4].index_selector;
            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 2)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
        {
            shared_temp[GI].error = shared_temp[GI + 2].error;
            shared_temp[GI].mode = shared_temp[GI + 2].mode;
            shared_temp[GI].index_selector = shared_temp[GI + 2].index_selector;
            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();


    // Final step: the first thread of the block picks the winner and writes the result
    if (threadInBlock < 1)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
        {
            shared_temp[GI].error           = shared_temp[GI + 1].error;
            shared_temp[GI].mode            = shared_temp[GI + 1].mode;
            shared_temp[GI].index_selector  = shared_temp[GI + 1].index_selector;
            shared_temp[GI].rotation        = shared_temp[GI + 1].rotation;
        }

        // Save the fast mode settings (for mode 6 the rotation field carries the p value)
        g_OutBuff1[blockID].error           = shared_temp[GI].error;
        g_OutBuff1[blockID].mode            = shared_temp[GI].mode & 0x07;
        g_OutBuff1[blockID].rotation        = shared_temp[GI].rotation;
        g_OutBuff1[blockID].index_selector  = shared_temp[GI].index_selector;
        g_OutBuff1[blockID].partition       = 0;
        g_OutBuff1[blockID].data2           = 0;


       // Enable cmp test
       // Runs the CMP mode 6 encoder when the quality setting is above 0.05 and
       // (if ENABLE_MODE6 is defined) mode 6 won the reduction above.
#ifdef ENABLE_CMP_MODE6
       if ((g_quality > 0.05f)
#ifdef ENABLE_MODE6
           && (shared_temp[GI].mode == 6)
#endif
           )
       {
            // Private copy of the block's 16 source pixels
            CGU_Vec4ui image_src[16];
            for (int i = 0; i < 16; i++)
            {
                image_src[i].x = shared_temp[threadBase + i].pixel.x;
                image_src[i].y = shared_temp[threadBase + i].pixel.y;
                image_src[i].z = shared_temp[threadBase + i].pixel.z;
                image_src[i].w = shared_temp[threadBase + i].pixel.w;
            }

            CGU_Vec4ui epo_code_out[2]     = {{0, 0, 0, 0}, {0, 0, 0, 0}};
            CGU_UINT32 index_packed_out[2] = {0, 0};
            CGU_UINT32 cmp_out6[4]         = {0, 0, 0, 0};
            CGU_UINT32 best_index_out[16];

            CGU_UINT32 besterr = cmp_GetIndexedEndPoints(epo_code_out,
                                                         best_index_out,
                                                         image_src,
                                                         15,          // numEntries 0..15 (Note this function is changed from using 16)
                                                         0xffffffff);

            // Error calculation needs updating to be the same all over
            // (the comparison is disabled, so the CMP result always replaces the fast result)
            //if (besterr > shared_temp[GI].error)
            {
                cmp_pack4bitindex32(index_packed_out, best_index_out);

#ifdef ENABLE_CMP_REFINE_MODE6_API
            if (g_quality > 0.5f)
            {
                // Refined for better quality using prior best_index_out as the initial input
                besterr = cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                                     best_index_out,
                                                     image_src,
                                                     16,                              // numEntries
                                                     g_modesettings[6].clusters,      // 16,
                                                     g_modesettings[6].bits,          // 58,
                                                     g_modesettings[6].channels3or4,  // 4,
                                                     0.1f);

                // Re-pack, the refinement may have changed best_index_out
                cmp_pack4bitindex32(index_packed_out, best_index_out);
            }
#endif

                cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);

                // Add in the CMP results; 0x10 is or'ed into the mode together with the
                // encoded block in data2.
                // NOTE(review): 0x10 presumably flags "data2 holds a ready block" for the
                // later passes - confirm against the consumer of g_OutBuff1.
                g_OutBuff1[blockID].error   = besterr;
                g_OutBuff1[blockID].mode    = 6 | 0x10;
                g_OutBuff1[blockID].data2.x = cmp_out6[0];
                g_OutBuff1[blockID].data2.y = cmp_out6[1];
                g_OutBuff1[blockID].data2.z = cmp_out6[2];
                g_OutBuff1[blockID].data2.w = cmp_out6[3];

            } // if better than fast mode
        }
#endif
    }

#else
    // Init: none of modes 4, 5, 6 is enabled, so only write a "no result" entry with maximum error
    if (threadInBlock < 1) {
        g_OutBuff1[blockID].error           = MAX_UINT;
        g_OutBuff1[blockID].mode            = 0;
        g_OutBuff1[blockID].rotation        = 0;
        g_OutBuff1[blockID].index_selector  = 0;
        g_OutBuff1[blockID].partition       = 0;
        g_OutBuff1[blockID].data2           = 0;
    }
    GroupSync();
#endif

}

// TryMode137CS
//
// Evaluates one two-subset BC7 mode (1, 3 or 7, chosen by g_mode_id) for a 4x4 block.
//
// Work layout: each block uses MAX_USED_THREAD (64) consecutive threads of the group.
// Thread `threadInBlock` scores partition number `threadInBlock` (0..63):
//   1. threads 0..15 of the block load the 16 pixels into shared_temp,
//   2. every thread builds per-subset min/max endpoints for its partition, tries each
//      p-bit combination and keeps, per subset, the p value with the smallest error,
//   3. a min-reduction over the 64 candidates (32, 16, 8, 4, 2, 1) leaves the best
//      candidate in shared_temp[threadBase],
//   4. thread 0 writes either that candidate or the previous result from g_InBuff to
//      g_OutBuff1, whichever has the smaller error.
//
// The `rotation` field is reused to carry the selected p-bits because these modes have
// no rotation. When none of ENABLE_MODE1/3/7 is defined the kernel only copies the
// previous result through.
//
// GI      - flattened thread index inside the thread group.
// groupID - thread-group index; only .x is used.
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode137CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // mode 1 3 7 all have 2 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;                                                        // threads cooperating on one block
    CGU_UINT32       BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;                       // blocks handled per thread group
    CGU_UINT32       blockInGroup    = GI / MAX_USED_THREAD;                                      // which of those blocks this thread serves
    CGU_UINT32       blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;  // linear block index used for g_InBuff / g_OutBuff1
    CGU_UINT32       threadBase      = blockInGroup * MAX_USED_THREAD;                            // shared_temp index of this block's thread 0
    CGU_UINT32       threadInBlock   = GI - threadBase;                                           // 0..63, also the partition id tested below

    // Convert the linear block index into the pixel coordinate of the block's top-left corner.
    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

    // The first 16 threads of the block each fetch one pixel (row-major within the 4x4 block),
    // scale it by 255, clamp to [0, 255] and store it as unsigned integers.
    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();  // pixels must be visible to all 64 threads of the block before they are read

    // Start every slot at the worst possible error so it can never win the reduction by accident.
    shared_temp[GI].error = 0xFFFFFFFF;

    // Use this to test only one of modes 1,3, or 7
    // if (g_mode_id != 7) {
    //     if (threadInBlock == 0)
    //         g_OutBuff1[blockID].error           = g_InBuff[blockID].error;
    //         g_OutBuff1[blockID].mode            = g_InBuff[blockID].mode;
    //         g_OutBuff1[blockID].partition       = g_InBuff[blockID].partition;
    //         g_OutBuff1[blockID].index_selector  = g_InBuff[blockID].index_selector;
    //         g_OutBuff1[blockID].rotation        = g_InBuff[blockID].rotation;
    //         g_OutBuff1[blockID].data2           = g_InBuff[blockID].data2;
    //      return;
    // }

#if defined(ENABLE_MODE1) || defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
    CGU_Vec4ui pixel_r;         // reconstructed pixel, later reused to hold the per-channel difference
    CGU_Vec4ui endPoint[2][2];  // endPoint[0..1 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[2][2];  // unquantized min/max endpoints, restored before each p-bit trial
    CGU_UINT32 color_index;
    if (threadInBlock < 64)
    {
        CGU_UINT32 partition = threadInBlock;
        CGU_UINT32 i;

        // Per-subset bounding box of the pixel colors: [0] accumulates the minimum, [1] the maximum.
        endPoint[0][0]  = MAX_UINT;
        endPoint[0][1]  = MIN_UINT;
        endPoint[1][0]  = MAX_UINT;
        endPoint[1][1]  = MIN_UINT;
        CGU_UINT32 bits = blockPartitions[partition];  // bit i gives the subset (0 or 1) of pixel i
        for (i = 0; i < 16; i++)
        {
            CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
            if (((bits >> i) & 0x01) == 1)
            {
                endPoint[1][0] = cmp_min(endPoint[1][0], pixel);
                endPoint[1][1] = cmp_max(endPoint[1][1], pixel);
            }
            else
            {
                endPoint[0][0] = cmp_min(endPoint[0][0], pixel);
                endPoint[0][1] = cmp_max(endPoint[0][1], pixel);
            }
        }

        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];

        CGU_UINT32 max_p = 2;  // mode 1

#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
        if (g_mode_id != 1)
        {
            // in mode 3 7, there are two p bits per subset, one for each end point
            max_p = 4;
        }
#endif
        CGU_UINT32 final_p[2] = {0, 0};                // best p value found for each subset
        CGU_UINT32 error[2]   = {MAX_UINT, MAX_UINT};  // error achieved by final_p for each subset
        for (CGU_UINT32 p = 0; p < max_p; p++)
        {
            // Every trial starts again from the unquantized endpoints.
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];

            // Run the mode-specific endpoint compression on both subsets. The `quantized`
            // output is not used here; only endPoint[i], which the helpers receive as their
            // first argument, is read afterwards.
            for (i = 0; i < 2; i++)  // loop through 2 subsets
            {
#if defined(ENABLE_MODE1)
                if (g_mode_id == 1)
                {
                    CGU_Vec4ui quantized[2];

                    compress_endpoints1(endPoint[i], quantized, p);
                }
#endif
#if defined(ENABLE_MODE3)
                if (g_mode_id == 3)
                {
                    CGU_Vec4ui quantized[2];

                    compress_endpoints3(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
#if defined(ENABLE_MODE7)
                if (g_mode_id == 7)
                {
                    CGU_Vec4ui quantized[2];
                    compress_endpoints7(endPoint[i], quantized, CGU_Vec2ui(p & 1, (p >> 1) & 1));
                }
#endif
            }

            // Signed vector from the low to the high endpoint of each subset.
            CGU_Vec4i span[2];
            span[0].x = endPoint[0][1].x - endPoint[0][0].x;
            span[0].y = endPoint[0][1].y - endPoint[0][0].y;
            span[0].z = endPoint[0][1].z - endPoint[0][0].z;
            span[0].w = endPoint[0][1].w - endPoint[0][0].w;
            span[1].x = endPoint[1][1].x - endPoint[1][0].x;
            span[1].y = endPoint[1][1].y - endPoint[1][0].y;
            span[1].z = endPoint[1][1].z - endPoint[1][0].z;
            span[1].w = endPoint[1][1].w - endPoint[1][0].w;

            // Only mode 7 projects along alpha; for the other modes the alpha span is dropped.
            // NOTE(review): this block is compiled only when ENABLE_MODE3 is defined although its
            // condition also covers mode 1 - confirm the guard is intentional.
#if defined(ENABLE_MODE3)
            if (g_mode_id != 7)
            {
                span[0].w = span[1].w = 0;
            }
#endif
            CGU_INT span_norm_sqr[2];
            span_norm_sqr[0] = dot(span[0], span[0]);
            span_norm_sqr[1] = dot(span[1], span[1]);

            // Subset 0 reference pixel is pixel 0: if it projects past roughly the middle of the
            // span, flip the span and swap the endpoints.
            // (Presumably this keeps the reference pixel's index in the lower half - confirm.)
            CGU_Vec4i diff;
            diff.x = shared_temp[threadBase + 0].pixel.x - endPoint[0][0].x;
            diff.y = shared_temp[threadBase + 0].pixel.y - endPoint[0][0].y;
            diff.z = shared_temp[threadBase + 0].pixel.z - endPoint[0][0].z;
            diff.w = shared_temp[threadBase + 0].pixel.w - endPoint[0][0].w;

            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_INT dotProduct = dot(span[0],diff);
            if (span_norm_sqr[0] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[0]))
            {
                span[0].x = -span[0].x;
                span[0].y = -span[0].y;
                span[0].z = -span[0].z;
                span[0].w = -span[0].w;
                swap(endPoint[0][0], endPoint[0][1]);
            }

            // Same check for subset 1, whose reference pixel comes from candidateFixUpIndex1D.
            diff.x = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.x - endPoint[1][0].x;
            diff.y = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.y - endPoint[1][0].y;
            diff.z = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.z - endPoint[1][0].z;
            diff.w = shared_temp[threadBase + candidateFixUpIndex1D[partition].x].pixel.w - endPoint[1][0].w;

            dotProduct = dot(span[1], diff);
            if (span_norm_sqr[1] > 0 && dotProduct > 0 && CGU_UINT32(dotProduct * 63.49999) > CGU_UINT32(32 * span_norm_sqr[1]))
            {
                span[1].x = -span[1].x;
                span[1].y = -span[1].y;
                span[1].z = -span[1].z;
                span[1].w = -span[1].w;
                swap(endPoint[1][0], endPoint[1][1]);
            }

            // Row of the aStep / aWeight lookup tables to use for this mode's index precision.
            CGU_UINT32 step_selector = 1;  // mode 1 has 3 bit index

#if defined(ENABLE_MODE3) || defined(ENABLE_MODE7)
            if (g_mode_id != 1)
            {
                step_selector = 2;  // mode 3 7 have 2 bit index
            }
#endif

            // Accumulate, per subset, the reconstruction error of all 16 pixels for this p value.
            CGU_UINT32 p_error[2] = {0, 0};
            for (i = 0; i < 16; i++)
            {
                CGU_UINT32 subset_index = (bits >> i) & 0x01;

                // Project the pixel onto its subset's span and map the 0..63 position to a palette
                // index through aStep; projections at or below zero use index 0, projections at or
                // beyond the span length use the last table entry.
                if (subset_index == 1)
                {
                    diff.x = shared_temp[threadBase + i].pixel.x - endPoint[1][0].x;
                    diff.y = shared_temp[threadBase + i].pixel.y - endPoint[1][0].y;
                    diff.z = shared_temp[threadBase + i].pixel.z - endPoint[1][0].z;
                    diff.w = shared_temp[threadBase + i].pixel.w - endPoint[1][0].w;

                    dotProduct  = dot(span[1], diff);
                    color_index = (span_norm_sqr[1] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[1]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[1])]
                                                                         : aStep[step_selector][63]);
                }
                else
                {
                    diff.x      = shared_temp[threadBase + i].pixel.x - endPoint[0][0].x;
                    diff.y      = shared_temp[threadBase + i].pixel.y - endPoint[0][0].y;
                    diff.z      = shared_temp[threadBase + i].pixel.z - endPoint[0][0].z;
                    diff.w      = shared_temp[threadBase + i].pixel.w - endPoint[0][0].w;
                    dotProduct  = dot(span[0], diff);
                    color_index = (span_norm_sqr[0] <= 0 || dotProduct <= 0)
                                      ? 0
                                      : ((dotProduct < span_norm_sqr[0]) ? aStep[step_selector][CGU_UINT32(dotProduct * 63.49999 / span_norm_sqr[0])]
                                                                         : aStep[step_selector][63]);
                }

                // Reconstruct the pixel: weighted blend of the two endpoints in 1/64 units,
                // with +32 for rounding before the shift.
                pixel_r = (endPoint[subset_index][0] * (64 - aWeight[step_selector][color_index]) +
                           endPoint[subset_index][1] * aWeight[step_selector][color_index] + 32U) >>
                          6;
                if (g_mode_id != 7)
                {
                    pixel_r.a = 255;  // modes other than 7 always reconstruct alpha as 255 here
                }

                // Ensure_A_Is_Larger presumably orders the two vectors so the unsigned subtraction
                // below cannot wrap - confirm against its definition. ComputeError receives the
                // difference vector for both arguments.
                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger(pixel_r, pixel);
                pixel_r -= pixel;
                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);
                if (subset_index == 1)
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            // Keep the best p value independently for each subset.
            for (i = 0; i < 2; i++)
            {
                if (p_error[i] < error[i])
                {
                    error[i]   = p_error[i];
                    final_p[i] = p;
                }
            }
        }

        // Publish this thread's candidate (one per partition) for the reduction below.
        shared_temp[GI].error     = error[0] + error[1];
        shared_temp[GI].mode      = g_mode_id;
        shared_temp[GI].partition = partition;

        // mode 1 3 7 don't have rotation, we use rotation for p bits
        // (mode 1: one bit per subset; otherwise: two bits per subset)
        if (g_mode_id == 1)
            shared_temp[GI].rotation = (final_p[1] << 1) | final_p[0];
        else
            shared_temp[GI].rotation = (final_p[1] << 2) | final_p[0];
    }
    GroupSync();

    // Min-reduction over the 64 candidates of the block: at each step the lower half of the
    // active threads takes over the upper half's candidate when that one has a smaller error.
    // A GroupSync() separates the steps so each one sees the previous step's results.
    if (threadInBlock < 32)
    {
        if (shared_temp[GI].error > shared_temp[GI + 32].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 32].error;
            shared_temp[GI].mode      = shared_temp[GI + 32].mode;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 16)
    {
        if (shared_temp[GI].error > shared_temp[GI + 16].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 16].error;
            shared_temp[GI].mode      = shared_temp[GI + 16].mode;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 8)
    {
        if (shared_temp[GI].error > shared_temp[GI + 8].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 8].error;
            shared_temp[GI].mode      = shared_temp[GI + 8].mode;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 4)
    {
        if (shared_temp[GI].error > shared_temp[GI + 4].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 4].error;
            shared_temp[GI].mode      = shared_temp[GI + 4].mode;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 2)
    {
        if (shared_temp[GI].error > shared_temp[GI + 2].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 2].error;
            shared_temp[GI].mode      = shared_temp[GI + 2].mode;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();
    if (threadInBlock < 1)
    {
        // Final reduction step; afterwards shared_temp[GI] holds the block's best candidate.
        if (shared_temp[GI].error > shared_temp[GI + 1].error)
        {
            shared_temp[GI].error     = shared_temp[GI + 1].error;
            shared_temp[GI].mode      = shared_temp[GI + 1].mode;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation  = shared_temp[GI + 1].rotation;
        }

        // Emit the new candidate only if it strictly beats the result of the earlier passes;
        // otherwise forward the earlier result unchanged.
        if ((g_InBuff[blockID].error > shared_temp[GI].error)){
            g_OutBuff1[blockID].error           = shared_temp[GI].error;
            g_OutBuff1[blockID].mode            = shared_temp[GI].mode;
            g_OutBuff1[blockID].partition       = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation        = shared_temp[GI].rotation;
            g_OutBuff1[blockID].index_selector  = 0;
            g_OutBuff1[blockID].data2           = 0;
        }
        else
        {
            g_OutBuff1[blockID].error           = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode            = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition       = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector  = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation        = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2           = g_InBuff[blockID].data2;
        }
    }
#else
    // Modes 1, 3 and 7 are all compiled out: nothing is evaluated in this pass.
    GroupSync();
    if (threadInBlock < 1)
    {
        // carry over prior results
            g_OutBuff1[blockID].error           = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode            = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition       = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector  = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation        = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2           = g_InBuff[blockID].data2;
    }
#endif
}

// TryMode02CS
//
// Evaluates one three-subset BC7 mode (0 or 2, chosen by g_mode_id) for a 4x4 block.
//
// Work layout: each block uses MAX_USED_THREAD (64) consecutive threads of the group.
// Thread `threadInBlock` scores three-subset partition `threadInBlock`; mode 0 tests 16
// partitions, any other mode id tests 64. The partition id that is stored and used to index
// candidateFixUpIndex1D is offset by 64 (blockPartitions2 is indexed without the offset).
//   1. threads 0..15 of the block load the 16 pixels into shared_temp,
//   2. every active thread builds per-subset min/max endpoints, tries each p-bit combination
//      (4 for mode 0, a single pass otherwise) and keeps the best p value per subset,
//   3. a min-reduction (32, 16, 8, 4, 2, 1) leaves the best candidate in shared_temp[threadBase],
//   4. thread 0 writes either that candidate or the previous result from g_InBuff to
//      g_OutBuff1, whichever has the smaller error.
//
// The `rotation` field is reused to carry the selected p-bits (two bits per subset).
// When neither ENABLE_MODE0 nor ENABLE_MODE2 is defined the kernel only copies the previous
// result through, mirroring the compiled-out path of TryMode137CS.
//
// GI      - flattened thread index inside the thread group.
// groupID - thread-group index; only .x is used.
CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void TryMode02CS(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)  // mode 0 2 have 3 subsets per block
{
    const CGU_UINT32 MAX_USED_THREAD = 64;                                                        // threads cooperating on one block
    CGU_UINT32       BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;                       // blocks handled per thread group
    CGU_UINT32       blockInGroup    = GI / MAX_USED_THREAD;                                      // which of those blocks this thread serves
    CGU_UINT32       blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;  // linear block index used for g_InBuff / g_OutBuff1
    CGU_UINT32       threadBase      = blockInGroup * MAX_USED_THREAD;                            // shared_temp index of this block's thread 0
    CGU_UINT32       threadInBlock   = GI - threadBase;                                           // 0..63

    // Convert the linear block index into the pixel coordinate of the block's top-left corner.
    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

#if defined(ENABLE_MODE0) || defined(ENABLE_MODE2)
    // The first 16 threads of the block each fetch one pixel (row-major within the 4x4 block),
    // scale it by 255, clamp to [0, 255] and store it as unsigned integers.
    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);
        shared_temp[GI].pixel.r = (CGU_UINT32)px.r;
        shared_temp[GI].pixel.g = (CGU_UINT32)px.g;
        shared_temp[GI].pixel.b = (CGU_UINT32)px.b;
        shared_temp[GI].pixel.a = (CGU_UINT32)px.a;
    }
    GroupSync();  // pixels must be visible to all threads of the block before they are read

    // Start every slot at the worst possible error; threads that test no partition
    // (threadInBlock >= num_partitions) therefore never win the reduction.
    shared_temp[GI].error = 0xFFFFFFFF;

    CGU_UINT32 num_partitions;
    if (0 == g_mode_id)
    {
        num_partitions = 16;
    }
    else
    {
        num_partitions = 64;
    }

    CGU_Vec4ui pixel_r;               // reconstructed pixel, later reused to hold the per-channel difference
    CGU_Vec4ui endPoint[3][2];        // endPoint[0..2 for subset id][0..1 for low and high in the subset]
    CGU_Vec4ui endPointBackup[3][2];  // unquantized min/max endpoints, restored before each p-bit trial
    CGU_UINT32 color_index[16];

    if (threadInBlock < num_partitions)
    {
        CGU_UINT32 partition = threadInBlock + 64;  // three-subset partition ids carry an offset of 64

        // Per-subset bounding box of the pixel colors: [0] accumulates the minimum, [1] the maximum.
        endPoint[0][0] = MAX_UINT;
        endPoint[0][1] = MIN_UINT;
        endPoint[1][0] = MAX_UINT;
        endPoint[1][1] = MIN_UINT;
        endPoint[2][0] = MAX_UINT;
        endPoint[2][1] = MIN_UINT;
        CGU_UINT32 bits2 = blockPartitions2[partition - 64];  // two bits per pixel: subset (0..2) of pixel i
        CGU_UINT32 i;
        for ( i = 0; i < 16; i ++ )
        {
            CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
            CGU_UINT32 subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
            if ( subset_index == 2 )
            {
                endPoint[2][0] = cmp_min( endPoint[2][0], pixel );
                endPoint[2][1] = cmp_max( endPoint[2][1], pixel );
            }
            else if ( subset_index == 1 )
            {
                endPoint[1][0] = cmp_min( endPoint[1][0], pixel );
                endPoint[1][1] = cmp_max( endPoint[1][1], pixel );
            }
            else
            {
                endPoint[0][0] = cmp_min( endPoint[0][0], pixel );
                endPoint[0][1] = cmp_max( endPoint[0][1], pixel );
            }
        }

        endPointBackup[0][0] = endPoint[0][0];
        endPointBackup[0][1] = endPoint[0][1];
        endPointBackup[1][0] = endPoint[1][0];
        endPointBackup[1][1] = endPoint[1][1];
        endPointBackup[2][0] = endPoint[2][0];
        endPointBackup[2][1] = endPoint[2][1];

        // Mode 0 tries four p-bit combinations; the other mode's endpoint compression takes
        // no p-bits, so a single pass is enough.
        CGU_UINT32 max_p;
        if (0 == g_mode_id)
        {
            max_p = 4;
        }
        else
        {
            max_p = 1;
        }

        CGU_UINT32 final_p[3] = { 0, 0, 0 };                     // best p value found for each subset
        CGU_UINT32 error[3] = { MAX_UINT, MAX_UINT, MAX_UINT };  // error achieved by final_p for each subset
        CGU_Vec4ui ep_quantized[2];                              // output of the compress helpers; not read here
        for ( CGU_UINT32 p = 0; p < max_p; p ++ )
        {
            // Every trial starts again from the unquantized endpoints.
            endPoint[0][0] = endPointBackup[0][0];
            endPoint[0][1] = endPointBackup[0][1];
            endPoint[1][0] = endPointBackup[1][0];
            endPoint[1][1] = endPointBackup[1][1];
            endPoint[2][0] = endPointBackup[2][0];
            endPoint[2][1] = endPointBackup[2][1];

            for ( i = 0; i < 3; i ++ )
            {
                if (0 == g_mode_id)
                {
                    compress_endpoints0( endPoint[i],ep_quantized, CGU_Vec2ui(p& 1, (p >> 1)& 1));
                }
                else
                {
                    compress_endpoints2( endPoint[i],ep_quantized );
                }
            }

            // Row of the aStep / aWeight lookup tables: 1 for mode 0, 2 for mode 2.
            CGU_UINT32 step_selector = 1 + (2 == g_mode_id);

            // Signed vector from the low to the high endpoint of each subset; alpha is ignored.
            CGU_Vec4i span[3];
            span[0] = cmp_castimp(endPoint[0][1] - endPoint[0][0]);
            span[1] = cmp_castimp(endPoint[1][1] - endPoint[1][0]);
            span[2] = cmp_castimp(endPoint[2][1] - endPoint[2][0]);
            span[0].w = span[1].w = span[2].w = 0;

            CGU_INT span_norm_sqr[3];
            span_norm_sqr[0] = dot( span[0], span[0] );
            span_norm_sqr[1] = dot( span[1], span[1] );
            span_norm_sqr[2] = dot( span[2], span[2] );

            // For each subset, take its reference pixel (pixel 0, then the two entries of
            // candidateFixUpIndex1D): if it projects past roughly the middle of the span, flip
            // the span and swap the endpoints.
            // (Presumably this keeps the reference pixel's index in the lower half - confirm.)
            // TODO: again, this shouldn't be necessary here in error calculation
            CGU_UINT32 ci[3] = { 0, candidateFixUpIndex1D[partition].x, candidateFixUpIndex1D[partition].y };
            CGU_Vec4ui diff;
            for (i = 0; i < 3; i ++)
            {
                diff = shared_temp[threadBase + ci[i]].pixel - endPoint[i][0];
                CGU_INT dotProduct = dot( span[i], diff );
                if ( span_norm_sqr[i] > 0 && dotProduct > 0 && CGU_UINT32( dotProduct * 63.49999 ) > CGU_UINT32( 32 * span_norm_sqr[i] ) )
                {
                    span[i] = -span[i];
                    swap(endPoint[i][0], endPoint[i][1]);
                }
            }

            // Accumulate, per subset, the reconstruction error of all 16 pixels for this p value.
            CGU_UINT32 p_error[3] = { 0, 0, 0 };

            for ( i = 0; i < 16; i ++ )
            {
                // Project the pixel onto its subset's span and map the 0..63 position to a palette
                // index through aStep; projections at or below zero use index 0, projections at or
                // beyond the span length use the last table entry.
                CGU_UINT32 subset_index = ( bits2 >> ( i * 2 ) ) & 0x03;
                if ( subset_index == 2 )
                {
                    diff = shared_temp[threadBase + i].pixel - endPoint[2][0];
                    CGU_INT dotProduct = dot( span[2], diff );
                    color_index[i] = ( span_norm_sqr[2] <= 0 || dotProduct <= 0 ) ? 0
                        : ( ( dotProduct < span_norm_sqr[2] ) ? aStep[step_selector][ CGU_UINT32( dotProduct * 63.49999 / span_norm_sqr[2] ) ] : aStep[step_selector][63] );
                }
                else if ( subset_index == 1 )
                {
                    diff =  shared_temp[threadBase + i].pixel - endPoint[1][0];
                    CGU_INT dotProduct = dot( span[1], diff );
                    color_index[i] = ( span_norm_sqr[1] <= 0 || dotProduct <= 0 ) ? 0
                        : ( ( dotProduct < span_norm_sqr[1] ) ? aStep[step_selector][ CGU_UINT32( dotProduct * 63.49999 / span_norm_sqr[1] ) ] : aStep[step_selector][63] );
                }
                else
                {
                    diff = shared_temp[threadBase + i].pixel - endPoint[0][0];
                    CGU_INT dotProduct = dot( span[0], diff );
                    color_index[i] = ( span_norm_sqr[0] <= 0 || dotProduct <= 0 ) ? 0
                        : ( ( dotProduct < span_norm_sqr[0] ) ? aStep[step_selector][ CGU_UINT32( dotProduct * 63.49999 / span_norm_sqr[0] ) ] : aStep[step_selector][63] );
                }

                // Reconstruct the pixel: weighted blend of the two endpoints in 1/64 units, with
                // +32 for rounding before the shift. Alpha is always reconstructed as 255 here.
                pixel_r = ( endPoint[subset_index][0]*( 64 - aWeight[step_selector][color_index[i]] ) +
                            endPoint[subset_index][1]* aWeight[step_selector][color_index[i]] + 32U ) >> 6;
                pixel_r.a = 255;

                // Ensure_A_Is_Larger presumably orders the two vectors so the unsigned subtraction
                // below cannot wrap - confirm against its definition. ComputeError receives the
                // difference vector for both arguments.
                CGU_Vec4ui pixel = shared_temp[threadBase + i].pixel;
                Ensure_A_Is_Larger( pixel_r, pixel );
                pixel_r -= pixel;

                CGU_UINT32 pixel_error = ComputeError(pixel_r, pixel_r);

                if ( subset_index == 2 )
                    p_error[2] += pixel_error;
                else if ( subset_index == 1 )
                    p_error[1] += pixel_error;
                else
                    p_error[0] += pixel_error;
            }

            // Keep the best p value independently for each subset.
            for ( i = 0; i < 3; i++ )
            {
                if (p_error[i] < error[i])
                {
                    error[i] = p_error[i];
                    final_p[i] = p;    // Borrow rotation for p
                }
            }
        }

        // Publish this thread's candidate for the reduction below; p-bits use two bits per subset.
        shared_temp[GI].error = error[0] + error[1] + error[2];
        shared_temp[GI].partition = partition;
        shared_temp[GI].rotation = (final_p[2] << 4) | (final_p[1] << 2) | final_p[0];
    }
    GroupSync();

    // Min-reduction over the block's candidates: at each step the lower half of the active
    // threads takes over the upper half's candidate when that one has a smaller error.
    // A GroupSync() separates the steps so each one sees the previous step's results.
    if (threadInBlock < 32)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 32].error )
        {
            shared_temp[GI].error = shared_temp[GI + 32].error;
            shared_temp[GI].partition = shared_temp[GI + 32].partition;
            shared_temp[GI].rotation = shared_temp[GI + 32].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 16)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 16].error )
        {
            shared_temp[GI].error = shared_temp[GI + 16].error;
            shared_temp[GI].partition = shared_temp[GI + 16].partition;
            shared_temp[GI].rotation = shared_temp[GI + 16].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 8)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 8].error )
        {
            shared_temp[GI].error = shared_temp[GI + 8].error;
            shared_temp[GI].partition = shared_temp[GI + 8].partition;
            shared_temp[GI].rotation = shared_temp[GI + 8].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 4)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 4].error )
        {
            shared_temp[GI].error = shared_temp[GI + 4].error;
            shared_temp[GI].partition = shared_temp[GI + 4].partition;
            shared_temp[GI].rotation = shared_temp[GI + 4].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 2)
    {
        if ( shared_temp[GI].error > shared_temp[GI + 2].error )
        {
            shared_temp[GI].error = shared_temp[GI + 2].error;
            shared_temp[GI].partition = shared_temp[GI + 2].partition;
            shared_temp[GI].rotation = shared_temp[GI + 2].rotation;
        }
    }
    GroupSync();

    if (threadInBlock < 1)
    {
        // Final reduction step; afterwards shared_temp[GI] holds the block's best candidate.
        if ( shared_temp[GI].error > shared_temp[GI + 1].error )
        {
            shared_temp[GI].error = shared_temp[GI + 1].error;
            shared_temp[GI].partition = shared_temp[GI + 1].partition;
            shared_temp[GI].rotation = shared_temp[GI + 1].rotation;
        }

        // Emit the new candidate only if it strictly beats the result of the earlier passes;
        // otherwise forward the earlier result unchanged.
        if (g_InBuff[blockID].error > shared_temp[GI].error)
        {
            g_OutBuff1[blockID].error           = shared_temp[GI].error;
            g_OutBuff1[blockID].mode            = g_mode_id;
            g_OutBuff1[blockID].partition       = shared_temp[GI].partition;
            g_OutBuff1[blockID].rotation        = shared_temp[GI].rotation;
            // Write every field of the record so no stale value is left behind
            // (same as the winning path of TryMode137CS).
            g_OutBuff1[blockID].index_selector  = 0;
            g_OutBuff1[blockID].data2           = 0;
        }
        else
        {
            g_OutBuff1[blockID].error           = g_InBuff[blockID].error;
            g_OutBuff1[blockID].mode            = g_InBuff[blockID].mode;
            g_OutBuff1[blockID].partition       = g_InBuff[blockID].partition;
            g_OutBuff1[blockID].index_selector  = g_InBuff[blockID].index_selector;
            g_OutBuff1[blockID].rotation        = g_InBuff[blockID].rotation;
            g_OutBuff1[blockID].data2           = g_InBuff[blockID].data2;
        }
    }
#else
    // Modes 0 and 2 are both compiled out: nothing is evaluated in this pass, but the output
    // buffer must still receive a valid record for the block.
    if (threadInBlock < 1)
    {
        // carry over prior results
        g_OutBuff1[blockID].error           = g_InBuff[blockID].error;
        g_OutBuff1[blockID].mode            = g_InBuff[blockID].mode;
        g_OutBuff1[blockID].partition       = g_InBuff[blockID].partition;
        g_OutBuff1[blockID].index_selector  = g_InBuff[blockID].index_selector;
        g_OutBuff1[blockID].rotation        = g_InBuff[blockID].rotation;
        g_OutBuff1[blockID].data2           = g_InBuff[blockID].data2;
    }
#endif
}

CMP_NUMTHREADS(THREAD_GROUP_SIZE, 1, 1) void EncodeBlocks(CGU_UINT32 GI CMP_SVGROUPINDEX, CGU_Vec3ui groupID CMP_SVGROUPID)
{
    CMP_CONSTANT CGU_UINT32 MAX_USED_THREAD = 16;
    CGU_UINT32       BLOCK_IN_GROUP  = THREAD_GROUP_SIZE / MAX_USED_THREAD;
    CGU_UINT32       blockInGroup    = GI / MAX_USED_THREAD;
    CGU_UINT32       blockID         = g_start_block_id + groupID.x * BLOCK_IN_GROUP + blockInGroup;
    CGU_UINT32       threadBase      = blockInGroup * MAX_USED_THREAD;
    CGU_UINT32       threadInBlock   = GI - threadBase;

    CGU_UINT32 block_y = blockID / g_num_block_x;
    CGU_UINT32 block_x = blockID - block_y * g_num_block_x;
    CGU_UINT32 base_x  = block_x * BLOCK_SIZE_X;
    CGU_UINT32 base_y  = block_y * BLOCK_SIZE_Y;

    CGU_UINT32 use_cmp             = g_InBuff[blockID].mode & 0x10;
    CGU_UINT32 best_mode           = g_InBuff[blockID].mode & 0x07;
    CGU_UINT32 best_partition      = g_InBuff[blockID].partition;
    CGU_UINT32 best_index_selector = g_InBuff[blockID].index_selector;
    CGU_UINT32 best_rotation       = g_InBuff[blockID].rotation;

    if (threadInBlock < 16)
    {
        CGU_Vec4f px            = g_Input.Load(CGU_Vec3ui(base_x + threadInBlock % 4, base_y + threadInBlock / 4, 0)) * 255.0f;
        px                      = clamp(px, 0.0f, 255.0f);

        CGU_Vec4ui pixel;
        pixel.r = (CGU_UINT32)px.r;
        pixel.g = (CGU_UINT32)px.g;
        pixel.b = (CGU_UINT32)px.b;
        pixel.a = (CGU_UINT32)px.a;

        if ((4 == best_mode) || (5 == best_mode))
            set_pixel_rotation(pixel,best_rotation);

        shared_temp[GI].pixel = pixel;
    }
    GroupSync();

    CGU_UINT32 bits = blockPartitions[best_partition];
    CGU_UINT32 bits2 = blockPartitions2[best_partition - 64];

    CGU_Vec4ui ep[2];
    ep[0] = MAX_UINT;
    ep[1] = MIN_UINT;

    CGU_Vec4ui ep_quantized[2];
    CGU_Vec3ui diff3;
    CGU_Vec4ui diff4;

    CMP_UNROLL for (CGU_INT ii = 2; ii >= 0; -- ii)
    {
        if (threadInBlock < 16)
        {
            CGU_Vec4ui epTemp[2];
            epTemp[0] = MAX_UINT;
            epTemp[1] = MIN_UINT;

            CGU_Vec4ui pixel = shared_temp[GI].pixel;

            CGU_UINT32 subset_index = ( bits >> threadInBlock ) & 0x01;
            CGU_UINT32 subset_index2 = ( bits2 >> ( threadInBlock * 2 ) ) & 0x03;
            if (0 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (0 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (0 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((4 == best_mode) || (5 == best_mode) || (6 == best_mode))
                {
                    epTemp[0] = epTemp[1] = pixel;
                }
            }
            else if (1 == ii)
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (1 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
                else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
                {
                    if (1 == subset_index)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }
            else
            {
                if ((0 == best_mode) || (2 == best_mode))
                {
                    if (2 == subset_index2)
                    {
                        epTemp[0] = epTemp[1] = pixel;
                    }
                }
            }

            shared_temp[GI].endPoint_low  = epTemp[0];
            shared_temp[GI].endPoint_high = epTemp[1];
        }
        GroupSync();

        if (threadInBlock < 8)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 8].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 8].endPoint_high);
        }
        GroupSync();

        if (threadInBlock < 4)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 4].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 4].endPoint_high);
        }
        GroupSync();

        if (threadInBlock < 2)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 2].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 2].endPoint_high);
        }
        GroupSync();

        if (threadInBlock < 1)
        {
            shared_temp[GI].endPoint_low  = cmp_min(shared_temp[GI].endPoint_low, shared_temp[GI + 1].endPoint_low);
            shared_temp[GI].endPoint_high = cmp_max(shared_temp[GI].endPoint_high, shared_temp[GI + 1].endPoint_high);
        }
        GroupSync();

        if (ii == (int)threadInBlock)
        {
            ep[0] = shared_temp[threadBase].endPoint_low;
            ep[1] = shared_temp[threadBase].endPoint_high;
        }
    }

    if (threadInBlock < 3)
    {

        CGU_Vec2ui P;

        if (1 == best_mode)
        {
            P = (best_rotation >> threadInBlock) & 1;
        }
        else
        {
            P = CGU_Vec2ui((best_rotation >> (threadInBlock * 2 + 0))&1, (best_rotation >> (threadInBlock * 2 + 1))&1);
        }

        if (0 == best_mode)
        {
            compress_endpoints0( ep,ep_quantized, P );
        }
        else if (1 == best_mode)
        {
            compress_endpoints1( ep,ep_quantized, P );
        }
        else if (2 == best_mode)
        {
            compress_endpoints2( ep,ep_quantized );
        }
        else if (3 == best_mode)
        {
            compress_endpoints3( ep,ep_quantized, P );
        }
        else if (4 == best_mode)
        {
            compress_endpoints4( ep,ep_quantized );
        }
        else if (5 == best_mode)
        {
            compress_endpoints5( ep,ep_quantized);
        }
        else if (6 == best_mode)
        {
            compress_endpoints6( ep,ep_quantized, P );
        }
        else //if (7 == mode)
        {
            compress_endpoints7( ep,ep_quantized, P );
        }

        CGU_Vec4i span = cmp_castimp(ep[1] - ep[0]);

        if (best_mode < 4)
            span.w = 0;

        if ((4 == best_mode) || (5 == best_mode))
        {
            if (0 == threadInBlock)
            {
                CGU_Vec2i span_norm_sqr = CGU_Vec2i( dot( span.rgb, span.rgb ),span.a * span.a );
                
                diff3 = shared_temp[threadBase + 0].pixel.rgb - ep[0].rgb;
                CGU_Vec2i dotProduct = CGU_Vec2i( dot( span.rgb, diff3 ), span.a * ( shared_temp[threadBase + 0].pixel.a - ep[0].a ) );
                if ( span_norm_sqr.x > 0 && dotProduct.x > 0 && CGU_UINT32( dotProduct.x * 63.49999 ) > CGU_UINT32( 32 * span_norm_sqr.x ) )
                {
                    swap(ep[0].rgb, ep[1].rgb);
                    swap(ep_quantized[0].rgb, ep_quantized[1].rgb);
                }
                if ( span_norm_sqr.y > 0 && dotProduct.y > 0 && CGU_UINT32( dotProduct.y * 63.49999 ) > CGU_UINT32( 32 * span_norm_sqr.y ) )
                {
                    swap(ep[0].a, ep[1].a);
                    swap(ep_quantized[0].a, ep_quantized[1].a);
                }
            }
        }
        else //if ((0 == mode) || (2 == mode) || (1 == mode) || (3 == mode) || (7 == mode) || (6 == mode))
        {
            CGU_INT p;
            if (0 == threadInBlock)
            {
                p = 0;
            }
            else if (1 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].x;
            }
            else //if (2 == threadInBlock)
            {
                p = candidateFixUpIndex1D[best_partition].y;
            }

            CGU_INT span_norm_sqr = dot( span, span );
            diff4 = shared_temp[threadBase + p].pixel - ep[0];
            CGU_INT dotProduct = dot( span, diff4 );
            if ( span_norm_sqr > 0 && dotProduct > 0 && CGU_UINT32( dotProduct * 63.49999 ) > CGU_UINT32( 32 * span_norm_sqr ) )
            {
                swap(ep[0], ep[1]);
                swap(ep_quantized[0], ep_quantized[1]);		
            }
        }

        shared_temp[GI].endPoint_low = ep[0];
        shared_temp[GI].endPoint_high = ep[1];
        shared_temp[GI].endPoint_low_quantized = ep_quantized[0];
        shared_temp[GI].endPoint_high_quantized = ep_quantized[1];
    }
    GroupSync();


    if (threadInBlock < 16)
    {
        CGU_UINT32 color_index = 0;
        CGU_UINT32 alpha_index = 0;

        CGU_Vec4ui epTemp[2];

        CGU_Vec2ui indexPrec;
        if ((0 == best_mode) || (1 == best_mode))
        {
            indexPrec = 1;
        }
        else if (6 == best_mode)
        {
            indexPrec = 0;
        }
        else if (4 == best_mode)
        {
            if (0 == best_index_selector)
            {
                indexPrec = CGU_Vec2ui(2, 1);
            }
            else
            {
                indexPrec = CGU_Vec2ui(1, 2);
            }
        }
        else
        {
            indexPrec = 2;
        }

        CGU_INT subset_index;
        if ((0 == best_mode) || (2 == best_mode))
        {
            subset_index = (bits2 >> (threadInBlock * 2)) & 0x03;
        }
        else if ((1 == best_mode) || (3 == best_mode) || (7 == best_mode))
        {
            subset_index = (bits >> threadInBlock) & 0x01;
        }
        else
        {
            subset_index = 0;
        }

        epTemp[0] = shared_temp[threadBase + subset_index].endPoint_low;
        epTemp[1] = shared_temp[threadBase + subset_index].endPoint_high;

        CGU_Vec4i span = cmp_castimp(epTemp[1] - epTemp[0]);
        if (best_mode < 4)
        {
            span.w = 0;
        }

        if ((4 == best_mode) || (5 == best_mode))
        {
            CGU_Vec2i span_norm_sqr;
            span_norm_sqr.x = dot( span.rgb, span.rgb );
            span_norm_sqr.y = span.a * span.a;
            diff3 = shared_temp[threadBase + threadInBlock].pixel.rgb - epTemp[0].rgb;
            CGU_INT dotProduct = dot( span.rgb, diff3 );
            color_index = ( span_norm_sqr.x <= 0 || dotProduct <= 0 ) ? 0
                    : ( ( dotProduct < span_norm_sqr.x ) ? aStep[indexPrec.x][ CGU_UINT32( dotProduct * 63.49999 / span_norm_sqr.x ) ] : aStep[indexPrec.x][63] );

            CGU_UINT32 diffa = shared_temp[threadBase + threadInBlock].pixel.a - epTemp[0].a;
            dotProduct = dot( span.a, diffa );
            alpha_index = ( span_norm_sqr.y <= 0 || dotProduct <= 0 ) ? 0
                    : ( ( dotProduct < span_norm_sqr.y ) ? aStep[indexPrec.y][ CGU_UINT32( dotProduct * 63.49999 / span_norm_sqr.y ) ] : aStep[indexPrec.y][63] );

            if (best_index_selector)
            {
                swap(color_index, alpha_index);
            }
        }
        else
        {
            CGU_INT span_norm_sqr = dot( span, span );
            diff4 = shared_temp[threadBase + threadInBlock].pixel - epTemp[0] ;
            CGU_INT dotProduct = dot( span, diff4);
            color_index = ( span_norm_sqr <= 0 || dotProduct <= 0 ) ? 0
                    : ( ( dotProduct < span_norm_sqr ) ? aStep[indexPrec.x][ CGU_UINT32( dotProduct * 63.49999 / span_norm_sqr ) ] : aStep[indexPrec.x][63] );
        }

        shared_temp[GI].error = color_index;
        shared_temp[GI].mode = alpha_index;
    }
    GroupSync();

    if (0 == threadInBlock)
    {
        CGU_Vec4ui blockRed  = {0x001fffc0, 0xfffe0000, 0x00000001, 0x00000000};
        CGU_Vec4ui blockBlue = {0x00000040, 0xfffffff8, 0x00000001, 0x00000000};

        CGU_Vec4ui block     = {0, 0, 0, 0};

        switch (best_mode)
        {
        case 0:
            block_package0(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 1:
            block_package1(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 2:
            block_package2(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 3:
            block_package3(block, best_partition, threadBase);
            //block = blockRed;
            break;
        case 4:
            block_package4(block, best_rotation, best_index_selector, threadBase);
            //block = blockRed;
            break;
        case 5:
            block_package5(block, best_rotation, threadBase);
            //block = blockRed;
            break;
        case 6:
            if (use_cmp) {
                block = g_InBuff[blockID].data2;
                //block = blockBlue;
            }
            else {
                block_package6( block, threadBase );
                //block = blockRed;
            }
            break;
        case 7:
            block_package7(block, best_partition, threadBase);
            //block = blockRed;
            break;
        default:  // error!
            block = blockRed;
            break;
        }

        g_OutBuff[blockID] = block;
    }
}

//=================================================
// This is a prototype API interface that runs on the CPU;
// move it to the GPU once it is complete.
//=================================================
// Compresses one block of 16 pixels and returns the encoded block as four
// 32-bit words.
//
//   image_src : 16 source pixels, four float channels each (x, y, z, w).
//   fquality  : currently ignored (marked CMP_UNUSED).
//
// The work done is selected entirely at compile time:
//   - ASPM_HLSL defined      : nothing runs; the zero-initialized block is returned.
//   - SIMULATE_GPU defined   : HLSLHost() is invoked and g_OutBuff[0] is returned.
//   - otherwise              : the pixels are converted to unsigned integers and
//                              each enabled ENABLE_CMP_MODE* stage encodes them.
// Every enabled stage overwrites the result unconditionally, so when several
// stages are compiled in, the output of the last one in source order
// (mode 6, then mode 4/5, then mode 1) is what gets returned. With no stage
// enabled the returned block is all zeros.
CMP_STATIC CGU_Vec4ui CompressBlockBC7_CMPMSC(CMP_IN CGU_Vec4f image_src[16], CMP_IN CGU_FLOAT fquality)
{
    CMP_UNUSED(fquality);

    CGU_Vec4ui cmp = {0, 0, 0, 0};

#ifndef ASPM_HLSL
#ifdef SIMULATE_GPU
    // NOTE(review): presumably HLSLHost() leaves its result in g_OutBuff[0] -- confirm.
    HLSLHost(image_src);
    cmp = g_OutBuff[0];
#else
    // Convert the float pixels to the unsigned integer form the encoders consume.
    // The cast truncates toward zero, exactly like the implicit conversion it replaces.
    CGU_Vec4ui image_srcui[16];
    for (CGU_INT ii = 0; ii < 16; ii++)
    {
        image_srcui[ii].x = (CGU_UINT32)image_src[ii].x;
        image_srcui[ii].y = (CGU_UINT32)image_src[ii].y;
        image_srcui[ii].z = (CGU_UINT32)image_src[ii].z;
        image_srcui[ii].w = (CGU_UINT32)image_src[ii].w;
    }

#if defined(ENABLE_CMP_MODE6)
    {
        CGU_Vec4ui epo_code_out[2] = {{0, 0, 0, 0}, {0, 0, 0, 0}};
        CGU_UINT32 best_index_out[16];

        // Fast encode of the block. The returned error value is not needed here.
        cmp_GetIndexedEndPoints(epo_code_out,
                                best_index_out,
                                image_srcui,
                                15,  // numEntries 0..15 (Note this function is changed from using 16)
                                0xffffffff);
        CGU_UINT32 index_packed_out[2] = {0, 0};
        cmp_pack4bitindex32(index_packed_out, best_index_out);

#ifdef ENABLE_CMP_REFINE_MODE6_API
        // Refine for better quality, then repack the updated indices.
        // The returned error value is not needed here.
        cmp_mode6_optimize_IndexAndEndPoints(epo_code_out,
                                             best_index_out,
                                             image_srcui,                     // using shared_temp[].pixel with 0 thread offset
                                             16,                              // numEntries
                                             g_modesettings[6].clusters,      // 16,
                                             g_modesettings[6].bits,          // 58,
                                             g_modesettings[6].channels3or4,  // 4,
                                             0.1f);
        cmp_pack4bitindex32(index_packed_out, best_index_out);
#endif

        // Encode the end points and packed indices into the output block.
        CGU_UINT32 cmp_out6[4] = {0, 0, 0, 0};
        cmp_encode_mode6(cmp_out6, epo_code_out, index_packed_out);
        cmp.x = cmp_out6[0];
        cmp.y = cmp_out6[1];
        cmp.z = cmp_out6[2];
        cmp.w = cmp_out6[3];
    }
#endif

#if defined(ENABLE_CMP_MODE4) || defined(ENABLE_CMP_MODE5)
    {
        // NOTE(review): the second argument is the literal 4 even when only
        // ENABLE_CMP_MODE5 is defined -- confirm this is intended.
        CGU_UINT32 cmp_out[4] = {0, 0, 0, 0};
        Compress_mode45(cmp_out, 4, image_srcui);
        cmp.x = cmp_out[0];
        cmp.y = cmp_out[1];
        cmp.z = cmp_out[2];
        cmp.w = cmp_out[3];
    }
#endif

#if defined(ENABLE_CMP_MODE1)
    {
        // NOTE(review): the buffer holds five words but only the first four are
        // copied out; presumably cmp_process_mode needs the extra slot -- confirm.
        CGU_UINT32 cmp_out1[5] = {0, 0, 0, 0, 0};
        cmp_process_mode(cmp_out1, image_srcui, 1);
        cmp.x = cmp_out1[0];
        cmp.y = cmp_out1[1];
        cmp.z = cmp_out1[2];
        cmp.w = cmp_out1[3];
    }
#endif

#endif  // SIMULATE_GPU
#endif  // Not HLSL

    return cmp;
}

